author    Anthony Barbier <anthony.barbier@arm.com>  2017-09-04 18:44:23 +0100
committer Anthony Barbier <anthony.barbier@arm.com>  2018-09-17 13:03:09 +0100
commit    6ff3b19ee6120edf015fad8caab2991faa3070af (patch)
tree      a7a6dcd16dfd56d79fa1b56a313caeebcc939b68
download  ComputeLibrary-6ff3b19ee6120edf015fad8caab2991faa3070af.tar.gz
COMPMID-344 Updated doxygen
Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
-rw-r--r--  .clang-format  48
-rw-r--r--  .clang-tidy  410
-rw-r--r--  .gitmodules  8
m---------  3rdparty  0
-rw-r--r--  SConscript  199
-rw-r--r--  SConstruct  208
-rw-r--r--  arm_compute/core/AccessWindowAutoPadding.h  76
-rw-r--r--  arm_compute/core/AccessWindowStatic.h  92
-rw-r--r--  arm_compute/core/AccessWindowTranspose.h  48
-rw-r--r--  arm_compute/core/CL/CLHelpers.h  105
-rw-r--r--  arm_compute/core/CL/CLKernelLibrary.h  248
-rw-r--r--  arm_compute/core/CL/CLKernels.h  90
-rw-r--r--  arm_compute/core/CL/CLTypes.h  41
-rw-r--r--  arm_compute/core/CL/ICLArray.h  118
-rw-r--r--  arm_compute/core/CL/ICLDistribution1D.h  102
-rw-r--r--  arm_compute/core/CL/ICLHOG.h  113
-rw-r--r--  arm_compute/core/CL/ICLKernel.h  157
-rw-r--r--  arm_compute/core/CL/ICLLut.h  94
-rw-r--r--  arm_compute/core/CL/ICLMultiHOG.h  56
-rw-r--r--  arm_compute/core/CL/ICLMultiImage.h  58
-rw-r--r--  arm_compute/core/CL/ICLSimple2DKernel.h  41
-rw-r--r--  arm_compute/core/CL/ICLSimple3DKernel.h  43
-rw-r--r--  arm_compute/core/CL/ICLSimpleKernel.h  66
-rw-r--r--  arm_compute/core/CL/ICLTensor.h  106
-rw-r--r--  arm_compute/core/CL/OpenCL.h  43
-rw-r--r--  arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h  71
-rw-r--r--  arm_compute/core/CL/kernels/CLAccumulateKernel.h  91
-rw-r--r--  arm_compute/core/CL/kernels/CLActivationLayerKernel.h  46
-rw-r--r--  arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h  72
-rw-r--r--  arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h  74
-rw-r--r--  arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h  77
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseAndKernel.h  68
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseNotKernel.h  49
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseOrKernel.h  68
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseXorKernel.h  68
-rw-r--r--  arm_compute/core/CL/kernels/CLBox3x3Kernel.h  51
-rw-r--r--  arm_compute/core/CL/kernels/CLCannyEdgeKernel.h  147
-rw-r--r--  arm_compute/core/CL/kernels/CLChannelCombineKernel.h  83
-rw-r--r--  arm_compute/core/CL/kernels/CLChannelExtractKernel.h  79
-rw-r--r--  arm_compute/core/CL/kernels/CLCol2ImKernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLColorConvertKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLConvolutionKernel.h  182
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h  76
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthConvertKernel.h  61
-rw-r--r--  arm_compute/core/CL/kernels/CLDerivativeKernel.h  72
-rw-r--r--  arm_compute/core/CL/kernels/CLDilateKernel.h  51
-rw-r--r--  arm_compute/core/CL/kernels/CLErodeKernel.h  51
-rw-r--r--  arm_compute/core/CL/kernels/CLFastCornersKernel.h  114
-rw-r--r--  arm_compute/core/CL/kernels/CLFillBorderKernel.h  77
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h  80
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h  81
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h  63
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h  70
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h  73
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h  84
-rw-r--r--  arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h  51
-rw-r--r--  arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h  67
-rw-r--r--  arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h  100
-rw-r--r--  arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h  105
-rw-r--r--  arm_compute/core/CL/kernels/CLHOGDetectorKernel.h  82
-rw-r--r--  arm_compute/core/CL/kernels/CLHarrisCornersKernel.h  85
-rw-r--r--  arm_compute/core/CL/kernels/CLHistogramKernel.h  98
-rw-r--r--  arm_compute/core/CL/kernels/CLIm2ColKernel.h  111
-rw-r--r--  arm_compute/core/CL/kernels/CLIntegralImageKernel.h  73
-rw-r--r--  arm_compute/core/CL/kernels/CLLKTrackerKernel.h  183
-rw-r--r--  arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h  68
-rw-r--r--  arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h  77
-rw-r--r--  arm_compute/core/CL/kernels/CLMeanStdDevKernel.h  74
-rw-r--r--  arm_compute/core/CL/kernels/CLMedian3x3Kernel.h  51
-rw-r--r--  arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h  104
-rw-r--r--  arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h  63
-rw-r--r--  arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h  52
-rw-r--r--  arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h  71
-rw-r--r--  arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h  73
-rw-r--r--  arm_compute/core/CL/kernels/CLPoolingLayerKernel.h  69
-rw-r--r--  arm_compute/core/CL/kernels/CLRemapKernel.h  70
-rw-r--r--  arm_compute/core/CL/kernels/CLScaleKernel.h  55
-rw-r--r--  arm_compute/core/CL/kernels/CLScharr3x3Kernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLSobel3x3Kernel.h  72
-rw-r--r--  arm_compute/core/CL/kernels/CLSobel5x5Kernel.h  116
-rw-r--r--  arm_compute/core/CL/kernels/CLSobel7x7Kernel.h  116
-rw-r--r--  arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h  109
-rw-r--r--  arm_compute/core/CL/kernels/CLTableLookupKernel.h  47
-rw-r--r--  arm_compute/core/CL/kernels/CLThresholdKernel.h  56
-rw-r--r--  arm_compute/core/CL/kernels/CLTransposeKernel.h  49
-rw-r--r--  arm_compute/core/CL/kernels/CLWarpAffineKernel.h  51
-rw-r--r--  arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h  51
-rw-r--r--  arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h  114
-rw-r--r--  arm_compute/core/CPP/CPPKernels.h  32
-rw-r--r--  arm_compute/core/CPP/ICPPKernel.h  53
-rw-r--r--  arm_compute/core/CPP/ICPPSimpleKernel.h  66
-rw-r--r--  arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h  74
-rw-r--r--  arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h  72
-rw-r--r--  arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h  70
-rw-r--r--  arm_compute/core/Coordinates.h  61
-rw-r--r--  arm_compute/core/Dimensions.h  178
-rw-r--r--  arm_compute/core/Error.h  160
-rw-r--r--  arm_compute/core/FixedPoint.h  217
-rw-r--r--  arm_compute/core/FixedPoint.inl  252
-rw-r--r--  arm_compute/core/HOGInfo.h  146
-rw-r--r--  arm_compute/core/Helpers.h  507
-rw-r--r--  arm_compute/core/Helpers.inl  306
-rw-r--r--  arm_compute/core/IAccessWindow.h  225
-rw-r--r--  arm_compute/core/IArray.h  149
-rw-r--r--  arm_compute/core/IDistribution.h  59
-rw-r--r--  arm_compute/core/IDistribution1D.h  84
-rw-r--r--  arm_compute/core/IHOG.h  54
-rw-r--r--  arm_compute/core/IKernel.h  72
-rw-r--r--  arm_compute/core/ILut.h  69
-rw-r--r--  arm_compute/core/IMultiHOG.h  61
-rw-r--r--  arm_compute/core/IMultiImage.h  60
-rw-r--r--  arm_compute/core/IPyramid.h  56
-rw-r--r--  arm_compute/core/ITensor.h  90
-rw-r--r--  arm_compute/core/ITensorInfo.h  195
-rw-r--r--  arm_compute/core/MultiImageInfo.h  66
-rw-r--r--  arm_compute/core/NEON/INEKernel.h  33
-rw-r--r--  arm_compute/core/NEON/INESimpleKernel.h  33
-rw-r--r--  arm_compute/core/NEON/NEColorConvertHelper.inl  888
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.h  686
-rw-r--r--  arm_compute/core/NEON/NEFixedPoint.inl  1018
-rw-r--r--  arm_compute/core/NEON/NEKernels.h  96
-rw-r--r--  arm_compute/core/NEON/NEMath.h  96
-rw-r--r--  arm_compute/core/NEON/NEMath.inl  141
-rw-r--r--  arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h  82
-rw-r--r--  arm_compute/core/NEON/kernels/NEAccumulateKernel.h  122
-rw-r--r--  arm_compute/core/NEON/kernels/NEActivationLayerKernel.h  84
-rw-r--r--  arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h  79
-rw-r--r--  arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h  79
-rw-r--r--  arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h  78
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h  68
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h  66
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h  68
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h  68
-rw-r--r--  arm_compute/core/NEON/kernels/NEBox3x3Kernel.h  62
-rw-r--r--  arm_compute/core/NEON/kernels/NECannyEdgeKernel.h  190
-rw-r--r--  arm_compute/core/NEON/kernels/NEChannelCombineKernel.h  125
-rw-r--r--  arm_compute/core/NEON/kernels/NEChannelExtractKernel.h  109
-rw-r--r--  arm_compute/core/NEON/kernels/NECol2ImKernel.h  100
-rw-r--r--  arm_compute/core/NEON/kernels/NEColorConvertKernel.h  88
-rw-r--r--  arm_compute/core/NEON/kernels/NEConvolutionKernel.h  251
-rw-r--r--  arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h  80
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthConvertKernel.h  68
-rw-r--r--  arm_compute/core/NEON/kernels/NEDerivativeKernel.h  94
-rw-r--r--  arm_compute/core/NEON/kernels/NEDilateKernel.h  49
-rw-r--r--  arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h  74
-rw-r--r--  arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NEErodeKernel.h  49
-rw-r--r--  arm_compute/core/NEON/kernels/NEFastCornersKernel.h  72
-rw-r--r--  arm_compute/core/NEON/kernels/NEFillArrayKernel.h  73
-rw-r--r--  arm_compute/core/NEON/kernels/NEFillBorderKernel.h  79
-rw-r--r--  arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h  75
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h  79
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h  88
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h  63
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h  81
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h  75
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h  82
-rw-r--r--  arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h  50
-rw-r--r--  arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h  73
-rw-r--r--  arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h  100
-rw-r--r--  arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h  141
-rw-r--r--  arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h  87
-rw-r--r--  arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h  126
-rw-r--r--  arm_compute/core/NEON/kernels/NEHistogramKernel.h  129
-rw-r--r--  arm_compute/core/NEON/kernels/NEIm2ColKernel.h  114
-rw-r--r--  arm_compute/core/NEON/kernels/NEIntegralImageKernel.h  50
-rw-r--r--  arm_compute/core/NEON/kernels/NELKTrackerKernel.h  144
-rw-r--r--  arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h  64
-rw-r--r--  arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h  164
-rw-r--r--  arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h  50
-rw-r--r--  arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h  161
-rw-r--r--  arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h  147
-rw-r--r--  arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h  99
-rw-r--r--  arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h  106
-rw-r--r--  arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h  105
-rw-r--r--  arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h  106
-rw-r--r--  arm_compute/core/NEON/kernels/NERemapKernel.h  78
-rw-r--r--  arm_compute/core/NEON/kernels/NEScaleKernel.h  89
-rw-r--r--  arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h  82
-rw-r--r--  arm_compute/core/NEON/kernels/NESobel3x3Kernel.h  82
-rw-r--r--  arm_compute/core/NEON/kernels/NESobel5x5Kernel.h  118
-rw-r--r--  arm_compute/core/NEON/kernels/NESobel7x7Kernel.h  122
-rw-r--r--  arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h  135
-rw-r--r--  arm_compute/core/NEON/kernels/NETableLookupKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NEThresholdKernel.h  81
-rw-r--r--  arm_compute/core/NEON/kernels/NETransposeKernel.h  78
-rw-r--r--  arm_compute/core/NEON/kernels/NEWarpKernel.h  117
-rw-r--r--  arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h  94
-rw-r--r--  arm_compute/core/PixelValue.h  168
-rw-r--r--  arm_compute/core/PyramidInfo.h  131
-rw-r--r--  arm_compute/core/Size2D.h  84
-rw-r--r--  arm_compute/core/Steps.h  66
-rw-r--r--  arm_compute/core/Strides.h  62
-rw-r--r--  arm_compute/core/SubTensorInfo.h  184
-rw-r--r--  arm_compute/core/TensorInfo.h  300
-rw-r--r--  arm_compute/core/TensorShape.h  141
-rw-r--r--  arm_compute/core/Types.h  636
-rw-r--r--  arm_compute/core/Utils.h  740
-rw-r--r--  arm_compute/core/Validate.h  563
-rw-r--r--  arm_compute/core/Window.h  355
-rw-r--r--  arm_compute/core/Window.inl  182
-rw-r--r--  arm_compute/runtime/Array.h  75
-rw-r--r--  arm_compute/runtime/CL/CLArray.h  108
-rw-r--r--  arm_compute/runtime/CL/CLDistribution1D.h  79
-rw-r--r--  arm_compute/runtime/CL/CLFunctions.h  94
-rw-r--r--  arm_compute/runtime/CL/CLHOG.h  80
-rw-r--r--  arm_compute/runtime/CL/CLLut.h  89
-rw-r--r--  arm_compute/runtime/CL/CLLutAllocator.h  88
-rw-r--r--  arm_compute/runtime/CL/CLMultiHOG.h  56
-rw-r--r--  arm_compute/runtime/CL/CLMultiImage.h  87
-rw-r--r--  arm_compute/runtime/CL/CLPyramid.h  82
-rw-r--r--  arm_compute/runtime/CL/CLScheduler.h  158
-rw-r--r--  arm_compute/runtime/CL/CLSubTensor.h  99
-rw-r--r--  arm_compute/runtime/CL/CLTensor.h  81
-rw-r--r--  arm_compute/runtime/CL/CLTensorAllocator.h  103
-rw-r--r--  arm_compute/runtime/CL/ICLSimpleFunction.h  50
-rw-r--r--  arm_compute/runtime/CL/functions/CLAbsoluteDifference.h  50
-rw-r--r--  arm_compute/runtime/CL/functions/CLAccumulate.h  73
-rw-r--r--  arm_compute/runtime/CL/functions/CLActivationLayer.h  51
-rw-r--r--  arm_compute/runtime/CL/functions/CLArithmeticAddition.h  52
-rw-r--r--  arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h  53
-rw-r--r--  arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h  67
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseAnd.h  50
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseNot.h  49
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseOr.h  50
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseXor.h  50
-rw-r--r--  arm_compute/runtime/CL/functions/CLBox3x3.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLCannyEdge.h  85
-rw-r--r--  arm_compute/runtime/CL/functions/CLChannelCombine.h  58
-rw-r--r--  arm_compute/runtime/CL/functions/CLChannelExtract.h  56
-rw-r--r--  arm_compute/runtime/CL/functions/CLColorConvert.h  68
-rw-r--r--  arm_compute/runtime/CL/functions/CLConvolution.h  128
-rw-r--r--  arm_compute/runtime/CL/functions/CLConvolutionLayer.h  121
-rw-r--r--  arm_compute/runtime/CL/functions/CLDepthConcatenate.h  69
-rw-r--r--  arm_compute/runtime/CL/functions/CLDepthConvert.h  60
-rw-r--r--  arm_compute/runtime/CL/functions/CLDerivative.h  59
-rw-r--r--  arm_compute/runtime/CL/functions/CLDilate.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLEqualizeHistogram.h  72
-rw-r--r--  arm_compute/runtime/CL/functions/CLErode.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLFastCorners.h  88
-rw-r--r--  arm_compute/runtime/CL/functions/CLFillBorder.h  49
-rw-r--r--  arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h  120
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMM.h  84
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h  50
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowp.h  85
-rw-r--r--  arm_compute/runtime/CL/functions/CLGaussian3x3.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLGaussian5x5.h  70
-rw-r--r--  arm_compute/runtime/CL/functions/CLGaussianPyramid.h  119
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGDescriptor.h  72
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGDetector.h  78
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGGradient.h  72
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGMultiDetection.h  105
-rw-r--r--  arm_compute/runtime/CL/functions/CLHarrisCorners.h  104
-rw-r--r--  arm_compute/runtime/CL/functions/CLHistogram.h  68
-rw-r--r--  arm_compute/runtime/CL/functions/CLIntegralImage.h  60
-rw-r--r--  arm_compute/runtime/CL/functions/CLLaplacianPyramid.h  85
-rw-r--r--  arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h  91
-rw-r--r--  arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h  79
-rw-r--r--  arm_compute/runtime/CL/functions/CLMagnitude.h  48
-rw-r--r--  arm_compute/runtime/CL/functions/CLMeanStdDev.h  56
-rw-r--r--  arm_compute/runtime/CL/functions/CLMedian3x3.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLMinMaxLocation.h  86
-rw-r--r--  arm_compute/runtime/CL/functions/CLNonLinearFilter.h  61
-rw-r--r--  arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLNormalizationLayer.h  71
-rw-r--r--  arm_compute/runtime/CL/functions/CLOpticalFlow.h  111
-rw-r--r--  arm_compute/runtime/CL/functions/CLPhase.h  48
-rw-r--r--  arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h  51
-rw-r--r--  arm_compute/runtime/CL/functions/CLPoolingLayer.h  52
-rw-r--r--  arm_compute/runtime/CL/functions/CLRemap.h  59
-rw-r--r--  arm_compute/runtime/CL/functions/CLScale.h  52
-rw-r--r--  arm_compute/runtime/CL/functions/CLScharr3x3.h  58
-rw-r--r--  arm_compute/runtime/CL/functions/CLSobel3x3.h  58
-rw-r--r--  arm_compute/runtime/CL/functions/CLSobel5x5.h  74
-rw-r--r--  arm_compute/runtime/CL/functions/CLSobel7x7.h  74
-rw-r--r--  arm_compute/runtime/CL/functions/CLSoftmaxLayer.h  69
-rw-r--r--  arm_compute/runtime/CL/functions/CLTableLookup.h  47
-rw-r--r--  arm_compute/runtime/CL/functions/CLThreshold.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLTranspose.h  50
-rw-r--r--  arm_compute/runtime/CL/functions/CLWarpAffine.h  52
-rw-r--r--  arm_compute/runtime/CL/functions/CLWarpPerspective.h  52
-rw-r--r--  arm_compute/runtime/CPP/CPPScheduler.h  73
-rw-r--r--  arm_compute/runtime/Distribution1D.h  55
-rw-r--r--  arm_compute/runtime/HOG.h  56
-rw-r--r--  arm_compute/runtime/IFunction.h  54
-rw-r--r--  arm_compute/runtime/ILutAllocator.h  84
-rw-r--r--  arm_compute/runtime/IScheduler.h  55
-rw-r--r--  arm_compute/runtime/ITensorAllocator.h  93
-rw-r--r--  arm_compute/runtime/Lut.h  68
-rw-r--r--  arm_compute/runtime/LutAllocator.h  58
-rw-r--r--  arm_compute/runtime/MultiHOG.h  58
-rw-r--r--  arm_compute/runtime/MultiImage.h  96
-rw-r--r--  arm_compute/runtime/NEON/INESimpleFunction.h  50
-rw-r--r--  arm_compute/runtime/NEON/NEFunctions.h  96
-rw-r--r--  arm_compute/runtime/NEON/NEScheduler.h  33
-rw-r--r--  arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h  50
-rw-r--r--  arm_compute/runtime/NEON/functions/NEAccumulate.h  74
-rw-r--r--  arm_compute/runtime/NEON/functions/NEActivationLayer.h  51
-rw-r--r--  arm_compute/runtime/NEON/functions/NEArithmeticAddition.h  48
-rw-r--r--  arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h  48
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h  66
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseAnd.h  46
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseNot.h  45
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseOr.h  46
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseXor.h  46
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBox3x3.h  58
-rw-r--r--  arm_compute/runtime/NEON/functions/NECannyEdge.h  97
-rw-r--r--  arm_compute/runtime/NEON/functions/NEChannelCombine.h  58
-rw-r--r--  arm_compute/runtime/NEON/functions/NEChannelExtract.h  56
-rw-r--r--  arm_compute/runtime/NEON/functions/NEColorConvert.h  65
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolution.h  128
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolutionLayer.h  115
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthConcatenate.h  66
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthConvert.h  67
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDerivative.h  70
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDilate.h  55
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h  72
-rw-r--r--  arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h  77
-rw-r--r--  arm_compute/runtime/NEON/functions/NEErode.h  55
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFastCorners.h  80
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFillBorder.h  58
-rw-r--r--  arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h  119
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMM.h  78
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h  49
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowp.h  85
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h  47
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGaussian3x3.h  55
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGaussian5x5.h  71
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGaussianPyramid.h  122
-rw-r--r--  arm_compute/runtime/NEON/functions/NEHOGDescriptor.h  71
-rw-r--r--  arm_compute/runtime/NEON/functions/NEHOGDetector.h  57
-rw-r--r--  arm_compute/runtime/NEON/functions/NEHOGGradient.h  72
-rw-r--r--  arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h  105
-rw-r--r--  arm_compute/runtime/NEON/functions/NEHarrisCorners.h  103
-rw-r--r--  arm_compute/runtime/NEON/functions/NEHistogram.h  63
-rw-r--r--  arm_compute/runtime/NEON/functions/NEIntegralImage.h  45
-rw-r--r--  arm_compute/runtime/NEON/functions/NELaplacianPyramid.h  85
-rw-r--r--  arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h  91
-rw-r--r--  arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h  79
-rw-r--r--  arm_compute/runtime/NEON/functions/NEMagnitude.h  47
-rw-r--r--  arm_compute/runtime/NEON/functions/NEMeanStdDev.h  62
-rw-r--r--  arm_compute/runtime/NEON/functions/NEMedian3x3.h  56
-rw-r--r--  arm_compute/runtime/NEON/functions/NEMinMaxLocation.h  71
-rw-r--r--  arm_compute/runtime/NEON/functions/NENonLinearFilter.h  61
-rw-r--r--  arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h  56
-rw-r--r--  arm_compute/runtime/NEON/functions/NENormalizationLayer.h  71
-rw-r--r--  arm_compute/runtime/NEON/functions/NEOpticalFlow.h  95
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPhase.h  46
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h  50
-rw-r--r--  arm_compute/runtime/NEON/functions/NEPoolingLayer.h  52
-rw-r--r--  arm_compute/runtime/NEON/functions/NERemap.h  60
-rw-r--r--  arm_compute/runtime/NEON/functions/NEScale.h  62
-rw-r--r--  arm_compute/runtime/NEON/functions/NEScharr3x3.h  59
-rw-r--r--  arm_compute/runtime/NEON/functions/NESobel3x3.h  59
-rw-r--r--  arm_compute/runtime/NEON/functions/NESobel5x5.h  75
-rw-r--r--  arm_compute/runtime/NEON/functions/NESobel7x7.h  75
-rw-r--r--  arm_compute/runtime/NEON/functions/NESoftmaxLayer.h  71
-rw-r--r--  arm_compute/runtime/NEON/functions/NETableLookup.h  47
-rw-r--r--  arm_compute/runtime/NEON/functions/NEThreshold.h  54
-rw-r--r--  arm_compute/runtime/NEON/functions/NETranspose.h  51
-rw-r--r--  arm_compute/runtime/NEON/functions/NEWarpAffine.h  52
-rw-r--r--  arm_compute/runtime/NEON/functions/NEWarpPerspective.h  52
-rw-r--r--  arm_compute/runtime/OMP/OMPScheduler.h  68
-rw-r--r--  arm_compute/runtime/Pyramid.h  76
-rw-r--r--  arm_compute/runtime/Scheduler.h  77
-rw-r--r--  arm_compute/runtime/SingleThreadScheduler.h  62
-rw-r--r--  arm_compute/runtime/SubTensor.h  73
-rw-r--r--  arm_compute/runtime/Tensor.h  65
-rw-r--r--  arm_compute/runtime/TensorAllocator.h  90
-rw-r--r--  arm_compute/runtime/Utils.h  41
m---------  data  0
-rw-r--r--  docs/00_introduction.dox  514
-rw-r--r--  docs/01_library.dox  250
-rw-r--r--  docs/02_tests.dox  93
-rw-r--r--  docs/Doxyfile  2458
-rw-r--r--  docs/header.html  56
-rw-r--r--  examples/SConscript  70
-rw-r--r--  examples/cl_convolution.cpp  118
-rw-r--r--  examples/cl_events.cpp  114
-rw-r--r--  examples/neon_cnn.cpp  230
-rw-r--r--  examples/neon_convolution.cpp  117
-rw-r--r--  examples/neon_copy_objects.cpp  152
-rw-r--r--  examples/neon_scale.cpp  90
-rw-r--r--  examples/neoncl_scale_median_gaussian.cpp  126
-rw-r--r--  include/CL/cl.h  1214
-rw-r--r--  include/CL/cl.hpp  12452
-rw-r--r--  include/CL/cl2.hpp  9526
-rw-r--r--  include/CL/cl_d3d10.h  126
-rw-r--r--  include/CL/cl_d3d11.h  126
-rw-r--r--  include/CL/cl_dx9_media_sharing.h  127
-rw-r--r--  include/CL/cl_egl.h  131
-rw-r--r--  include/CL/cl_ext.h  316
-rw-r--r--  include/CL/cl_gl.h  162
-rw-r--r--  include/CL/cl_gl_ext.h  69
-rw-r--r--  include/CL/cl_platform.h  1254
-rw-r--r--  include/CL/opencl.h  54
-rw-r--r--  opencl-1.2-stubs/SConscript  7
-rwxr-xr-x  opencl-1.2-stubs/opencl_stubs.c  1002
-rwxr-xr-x  scripts/add_copyright.py  83
-rwxr-xr-x  scripts/check_bad_style.sh  71
-rwxr-xr-x  scripts/check_clang-tidy.py  59
-rw-r--r--  scripts/clang-tidy.h  112
-rwxr-xr-x  scripts/clang-tidy.sh  91
-rw-r--r--  scripts/copyright_eula.txt  19
-rw-r--r--  scripts/copyright_mit.txt  21
-rwxr-xr-x  scripts/fix_code_formatting.sh  33
-rwxr-xr-x  scripts/format_doxygen.py  151
-rwxr-xr-x  scripts/include_functions_kernels.py  64
-rw-r--r--  src/core/AccessWindowAutoPadding.cpp  85
-rw-r--r--  src/core/AccessWindowStatic.cpp  202
-rw-r--r--  src/core/AccessWindowTranspose.cpp  209
-rw-r--r--  src/core/CL/CLHelpers.cpp  165
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp  597
-rw-r--r--  src/core/CL/ICLDistribution1D.cpp  51
-rw-r--r--  src/core/CL/ICLHOG.cpp  47
-rw-r--r--  src/core/CL/ICLKernel.cpp  154
-rw-r--r--  src/core/CL/ICLLut.cpp  47
-rw-r--r--  src/core/CL/ICLMultiHOG.cpp  38
-rw-r--r--  src/core/CL/ICLMultiImage.cpp  39
-rw-r--r--  src/core/CL/ICLSimple2DKernel.cpp  48
-rw-r--r--  src/core/CL/ICLSimple3DKernel.cpp  47
-rw-r--r--  src/core/CL/ICLSimpleKernel.cpp  54
-rw-r--r--  src/core/CL/ICLTensor.cpp  56
-rw-r--r--  src/core/CL/OpenCL.cpp  586
-rw-r--r--  src/core/CL/cl_kernels/absdiff.cl  65
-rw-r--r--  src/core/CL/cl_kernels/accumulate.cl  130
-rw-r--r--  src/core/CL/cl_kernels/activation_layer.cl  89
-rw-r--r--  src/core/CL/cl_kernels/arithmetic_op.cl  122
-rw-r--r--  src/core/CL/cl_kernels/batchnormalization_layer.cl  99
-rw-r--r--  src/core/CL/cl_kernels/bitwise_op.cl  159
-rw-r--r--  src/core/CL/cl_kernels/canny.cl  429
-rw-r--r--  src/core/CL/cl_kernels/channel_combine.cl  416
-rw-r--r--  src/core/CL/cl_kernels/channel_extract.cl  272
-rw-r--r--  src/core/CL/cl_kernels/color_convert.cl  1823
-rw-r--r--  src/core/CL/cl_kernels/concatenate.cl  53
-rw-r--r--  src/core/CL/cl_kernels/convolution3x3.cl  138
-rw-r--r--  src/core/CL/cl_kernels/convolution5x5.cl  289
-rw-r--r--  src/core/CL/cl_kernels/convolution7x7.cl  340
-rw-r--r--  src/core/CL/cl_kernels/convolution9x9.cl  406
-rw-r--r--  src/core/CL/cl_kernels/convolution_layer.cl  241
-rw-r--r--  src/core/CL/cl_kernels/convolution_rectangle.cl  118
-rw-r--r--  src/core/CL/cl_kernels/depth_convert.cl  98
-rw-r--r--  src/core/CL/cl_kernels/derivative.cl  80
-rw-r--r--  src/core/CL/cl_kernels/dilate.cl  56
-rw-r--r--  src/core/CL/cl_kernels/erode.cl  56
-rw-r--r--  src/core/CL/cl_kernels/fast_corners.cl  260
-rw-r--r--  src/core/CL/cl_kernels/fill_border.cl  161
-rw-r--r--  src/core/CL/cl_kernels/gaussian_pyramid.cl  113
-rw-r--r--  src/core/CL/cl_kernels/gemm.cl  1099
-rw-r--r--  src/core/CL/cl_kernels/harris_corners.cl  376
-rw-r--r--  src/core/CL/cl_kernels/helpers.h  218
-rw-r--r--  src/core/CL/cl_kernels/histogram.cl  243
-rw-r--r--  src/core/CL/cl_kernels/hog.cl  455
-rw-r--r--  src/core/CL/cl_kernels/integral_image.cl  100
-rw-r--r--  src/core/CL/cl_kernels/magnitude_phase.cl  162
-rw-r--r--  src/core/CL/cl_kernels/mean_stddev.cl  84
-rw-r--r--  src/core/CL/cl_kernels/minmaxloc.cl  164
-rw-r--r--  src/core/CL/cl_kernels/non_linear_filter3x3.cl  186
-rw-r--r--  src/core/CL/cl_kernels/non_linear_filter5x5.cl  479
-rw-r--r--  src/core/CL/cl_kernels/non_linear_filter_helpers.h  145
-rw-r--r--  src/core/CL/cl_kernels/nonmax.cl  70
-rw-r--r--  src/core/CL/cl_kernels/normalization_layer.cl  154
-rw-r--r--  src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl  522
-rw-r--r--  src/core/CL/cl_kernels/pixelwise_mul_float.cl  89
-rw-r--r--  src/core/CL/cl_kernels/pixelwise_mul_int.cl  79
-rw-r--r--  src/core/CL/cl_kernels/pooling_layer.cl  159
-rw-r--r--  src/core/CL/cl_kernels/remap.cl  132
-rw-r--r--  src/core/CL/cl_kernels/scale.cl  123
-rw-r--r--  src/core/CL/cl_kernels/scharr_filter.cl  124
-rw-r--r--  src/core/CL/cl_kernels/sobel_filter.cl  541
-rw-r--r--  src/core/CL/cl_kernels/softmax_layer.cl  221
-rw-r--r--  src/core/CL/cl_kernels/tablelookup.cl  114
-rw-r--r--  src/core/CL/cl_kernels/threshold.cl  104
-rw-r--r--  src/core/CL/cl_kernels/transpose.cl  217
-rw-r--r--  src/core/CL/cl_kernels/types.h  56
-rw-r--r--  src/core/CL/cl_kernels/warp_affine.cl  120
-rw-r--r--  src/core/CL/cl_kernels/warp_helpers.h  111
-rw-r--r--  src/core/CL/cl_kernels/warp_perspective.cl  128
-rw-r--r--  src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp  102
-rw-r--r--  src/core/CL/kernels/CLAccumulateKernel.cpp  83
-rw-r--r--  src/core/CL/kernels/CLActivationLayerKernel.cpp  64
-rw-r--r--  src/core/CL/kernels/CLArithmeticAdditionKernel.cpp  111
-rw-r--r--  src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp  111
-rw-r--r--  src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp  115
-rw-r--r--  src/core/CL/kernels/CLBitwiseAndKernel.cpp  88
-rw-r--r--  src/core/CL/kernels/CLBitwiseNotKernel.cpp  48
-rw-r--r--  src/core/CL/kernels/CLBitwiseOrKernel.cpp  89
-rw-r--r--  src/core/CL/kernels/CLBitwiseXorKernel.cpp  89
-rw-r--r--  src/core/CL/kernels/CLBox3x3Kernel.cpp  77
-rw-r--r--  src/core/CL/kernels/CLCannyEdgeKernel.cpp  255
-rw-r--r--  src/core/CL/kernels/CLChannelCombineKernel.cpp  244
-rw-r--r--  src/core/CL/kernels/CLChannelExtractKernel.cpp  148
-rw-r--r--  src/core/CL/kernels/CLCol2ImKernel.cpp  85
-rw-r--r--  src/core/CL/kernels/CLColorConvertKernel.cpp  476
-rw-r--r--  src/core/CL/kernels/CLConvolutionKernel.cpp  330
-rw-r--r--  src/core/CL/kernels/CLDepthConcatenateKernel.cpp  113
-rw-r--r--  src/core/CL/kernels/CLDepthConvertKernel.cpp  99
-rw-r--r--  src/core/CL/kernels/CLDerivativeKernel.cpp  145
-rw-r--r--  src/core/CL/kernels/CLDilateKernel.cpp  65
-rw-r--r--  src/core/CL/kernels/CLErodeKernel.cpp  65
-rw-r--r--  src/core/CL/kernels/CLFastCornersKernel.cpp  172
-rw-r--r--  src/core/CL/kernels/CLFillBorderKernel.cpp  175
-rw-r--r--  src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp  106
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp  122
-rw-r--r--  src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp  92
-rw-r--r--  src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp  92
-rw-r--r--  src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp  168
-rw-r--r--  src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp  129
-rw-r--r--  src/core/CL/kernels/CLGaussian3x3Kernel.cpp  76
-rw-r--r--  src/core/CL/kernels/CLGaussian5x5Kernel.cpp  45
-rw-r--r--  src/core/CL/kernels/CLGaussianPyramidKernel.cpp  218
-rw-r--r--  src/core/CL/kernels/CLHOGDescriptorKernel.cpp  200
-rw-r--r--  src/core/CL/kernels/CLHOGDetectorKernel.cpp  130
-rw-r--r--  src/core/CL/kernels/CLHarrisCornersKernel.cpp  126
-rw-r--r--  src/core/CL/kernels/CLHistogramKernel.cpp  224
-rw-r--r--  src/core/CL/kernels/CLIm2ColKernel.cpp  202
-rw-r--r--  src/core/CL/kernels/CLIntegralImageKernel.cpp  112
-rw-r--r--  src/core/CL/kernels/CLLKTrackerKernel.cpp  285
-rw-r--r--  src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp  116
-rw-r--r--  src/core/CL/kernels/CLMagnitudePhaseKernel.cpp  168
-rw-r--r--  src/core/CL/kernels/CLMeanStdDevKernel.cpp  134
-rw-r--r--  src/core/CL/kernels/CLMedian3x3Kernel.cpp  66
-rw-r--r--  src/core/CL/kernels/CLMinMaxLocationKernel.cpp  169
-rw-r--r--  src/core/CL/kernels/CLNonLinearFilterKernel.cpp  98
-rw-r--r--  src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp  72
-rw-r--r--  src/core/CL/kernels/CLNormalizationLayerKernel.cpp  111
-rw-r--r--  src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp  154
-rw-r--r--  src/core/CL/kernels/CLPoolingLayerKernel.cpp  180
-rw-r--r--  src/core/CL/kernels/CLRemapKernel.cpp  108
-rw-r--r--  src/core/CL/kernels/CLScaleKernel.cpp  99
-rw-r--r--  src/core/CL/kernels/CLScharr3x3Kernel.cpp  132
-rw-r--r--  src/core/CL/kernels/CLSobel3x3Kernel.cpp  133
-rw-r--r--  src/core/CL/kernels/CLSobel5x5Kernel.cpp  234
-rw-r--r--  src/core/CL/kernels/CLSobel7x7Kernel.cpp  238
-rw-r--r--  src/core/CL/kernels/CLSoftmaxLayerKernel.cpp  216
-rw-r--r--  src/core/CL/kernels/CLTableLookupKernel.cpp  63
-rw-r--r--  src/core/CL/kernels/CLThresholdKernel.cpp  76
-rw-r--r--  src/core/CL/kernels/CLTransposeKernel.cpp  82
-rw-r--r--  src/core/CL/kernels/CLWarpAffineKernel.cpp  99
-rw-r--r--  src/core/CL/kernels/CLWarpPerspectiveKernel.cpp  99
-rw-r--r--  src/core/CL/kernels/CLWeightsReshapeKernel.cpp  163
-rw-r--r--  src/core/CPP/ICPPSimpleKernel.cpp  53
-rw-r--r--  src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp  110
-rw-r--r--  src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp  120
-rw-r--r--  src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp  115
-rw-r--r--  src/core/Error.cpp  52
-rw-r--r--  src/core/HOGInfo.cpp  122
-rw-r--r--  src/core/Helpers.cpp  164
-rw-r--r--  src/core/IAccessWindow.cpp  221
-rw-r--r--  src/core/IDistribution.cpp  36
-rw-r--r--  src/core/IDistribution1D.cpp  69
-rw-r--r--  src/core/IKernel.cpp  54
-rw-r--r--  src/core/ITensor.cpp  150
-rw-r--r--  src/core/MultiImageInfo.cpp  53
-rw-r--r--  src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp  211
-rw-r--r--  src/core/NEON/kernels/NEAccumulateKernel.cpp  357
-rw-r--r--  src/core/NEON/kernels/NEActivationLayerKernel.cpp  302
-rw-r--r--  src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp  378
-rw-r--r--  src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp  371
-rw-r--r--  src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp  187
-rw-r--r--  src/core/NEON/kernels/NEBitwiseAndKernel.cpp  109
-rw-r--r--  src/core/NEON/kernels/NEBitwiseNotKernel.cpp  96
-rw-r--r--  src/core/NEON/kernels/NEBitwiseOrKernel.cpp  109
-rw-r--r--  src/core/NEON/kernels/NEBitwiseXorKernel.cpp  105
-rw-r--r--  src/core/NEON/kernels/NEBox3x3Kernel.cpp  220
-rw-r--r--  src/core/NEON/kernels/NECannyEdgeKernel.cpp  1856
-rw-r--r--  src/core/NEON/kernels/NEChannelCombineKernel.cpp  467
-rw-r--r--  src/core/NEON/kernels/NEChannelExtractKernel.cpp  354
-rw-r--r--  src/core/NEON/kernels/NECol2ImKernel.cpp  124
-rw-r--r--  src/core/NEON/kernels/NEColorConvertKernel.cpp  582
-rw-r--r--  src/core/NEON/kernels/NEConvolutionKernel.cpp  1618
-rw-r--r--  src/core/NEON/kernels/NECumulativeDistributionKernel.cpp  110
-rw-r--r--  src/core/NEON/kernels/NEDepthConcatenateKernel.cpp  105
-rw-r--r--  src/core/NEON/kernels/NEDepthConvertKernel.cpp  384
-rw-r--r--  src/core/NEON/kernels/NEDerivativeKernel.cpp  224
-rw-r--r--  src/core/NEON/kernels/NEDilateKernel.cpp  126
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp  207
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp  817
-rw-r--r--  src/core/NEON/kernels/NEErodeKernel.cpp  126
-rw-r--r--  src/core/NEON/kernels/NEFastCornersKernel.cpp  474
-rw-r--r--  src/core/NEON/kernels/NEFillArrayKernel.cpp  91
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.cpp  259
-rw-r--r--  src/core/NEON/kernels/NEFillInnerBorderKernel.cpp  137
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp  191
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp  423
-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp  128
-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp  202
-rw-r--r--  src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp  1168
-rw-r--r--  src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp  150
-rw-r--r--  src/core/NEON/kernels/NEGaussian3x3Kernel.cpp  132
-rw-r--r--  src/core/NEON/kernels/NEGaussian5x5Kernel.cpp  203
-rw-r--r--  src/core/NEON/kernels/NEGaussianPyramidKernel.cpp  279
-rw-r--r--  src/core/NEON/kernels/NEHOGDescriptorKernel.cpp  802
-rw-r--r--  src/core/NEON/kernels/NEHOGDetectorKernel.cpp  186
-rw-r--r--  src/core/NEON/kernels/NEHarrisCornersKernel.cpp  1137
-rw-r--r--  src/core/NEON/kernels/NEHistogramKernel.cpp  252
-rw-r--r--  src/core/NEON/kernels/NEIm2ColKernel.cpp  338
-rw-r--r--  src/core/NEON/kernels/NEIntegralImageKernel.cpp  141
-rw-r--r--  src/core/NEON/kernels/NELKTrackerKernel.cpp  533
-rw-r--r--  src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp  226
-rw-r--r--  src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp  869
-rw-r--r--  src/core/NEON/kernels/NEMeanStdDevKernel.cpp  152
-rw-r--r--  src/core/NEON/kernels/NEMedian3x3Kernel.cpp  135
-rw-r--r--  src/core/NEON/kernels/NEMinMaxLocationKernel.cpp  361
-rw-r--r--  src/core/NEON/kernels/NENonLinearFilterKernel.cpp  1009
-rw-r--r--  src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp  513
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.cpp  218
-rw-r--r--  src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp  524
-rw-r--r--  src/core/NEON/kernels/NEPoolingLayerKernel.cpp  415
-rw-r--r--  src/core/NEON/kernels/NERemapKernel.cpp  226
-rw-r--r--  src/core/NEON/kernels/NEScaleKernel.cpp  359
-rw-r--r--  src/core/NEON/kernels/NEScharr3x3Kernel.cpp  259
-rw-r--r--  src/core/NEON/kernels/NESobel3x3Kernel.cpp  269
-rw-r--r--  src/core/NEON/kernels/NESobel5x5Kernel.cpp  402
-rw-r--r--  src/core/NEON/kernels/NESobel7x7Kernel.cpp  520
-rw-r--r--  src/core/NEON/kernels/NESoftmaxLayerKernel.cpp  474
-rw-r--r--  src/core/NEON/kernels/NETableLookupKernel.cpp  142
-rw-r--r--  src/core/NEON/kernels/NEThresholdKernel.cpp  129
-rw-r--r--  src/core/NEON/kernels/NETransposeKernel.cpp  241
-rw-r--r--  src/core/NEON/kernels/NEWarpKernel.cpp  651
-rw-r--r--  src/core/NEON/kernels/NEWeightsReshapeKernel.cpp  175
-rw-r--r--  src/core/PyramidInfo.cpp  105
-rw-r--r--  src/core/SubTensorInfo.cpp  78
-rw-r--r--  src/core/TensorInfo.cpp  377
-rw-r--r--  src/core/Utils.cpp  329
-rw-r--r--  src/core/Validate.cpp  215
-rw-r--r--  src/runtime/CL/CLDistribution1D.cpp  61
-rw-r--r--  src/runtime/CL/CLHOG.cpp  84
-rw-r--r--  src/runtime/CL/CLLut.cpp  99
-rw-r--r--  src/runtime/CL/CLLutAllocator.cpp  77
-rw-r--r--  src/runtime/CL/CLMultiHOG.cpp  52
-rw-r--r--  src/runtime/CL/CLMultiImage.cpp  168
-rw-r--r--  src/runtime/CL/CLPyramid.cpp  130
-rw-r--r--  src/runtime/CL/CLScheduler.cpp  49
-rw-r--r--  src/runtime/CL/CLSubTensor.cpp  81
-rw-r--r--  src/runtime/CL/CLTensor.cpp  73
-rw-r--r--  src/runtime/CL/CLTensorAllocator.cpp  87
-rw-r--r--  src/runtime/CL/ICLSimpleFunction.cpp  42
-rw-r--r--  src/runtime/CL/functions/CLAbsoluteDifference.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLAccumulate.cpp  52
-rw-r--r--  src/runtime/CL/functions/CLActivationLayer.cpp  36
-rw-r--r--  src/runtime/CL/functions/CLArithmeticAddition.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLArithmeticSubtraction.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLBatchNormalizationLayer.cpp  48
-rw-r--r--  src/runtime/CL/functions/CLBitwiseAnd.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLBitwiseNot.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLBitwiseOr.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLBitwiseXor.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLBox3x3.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLCannyEdge.cpp  155
-rw-r--r--  src/runtime/CL/functions/CLChannelCombine.cpp  45
-rw-r--r--  src/runtime/CL/functions/CLChannelExtract.cpp  45
-rw-r--r--  src/runtime/CL/functions/CLColorConvert.cpp  59
-rw-r--r--  src/runtime/CL/functions/CLConvolution.cpp  114
-rw-r--r--  src/runtime/CL/functions/CLConvolutionLayer.cpp  247
-rw-r--r--  src/runtime/CL/functions/CLDepthConcatenate.cpp  71
-rw-r--r--  src/runtime/CL/functions/CLDepthConvert.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLDerivative.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLDilate.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLEqualizeHistogram.cpp  110
-rw-r--r--  src/runtime/CL/functions/CLErode.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLFastCorners.cpp  127
-rw-r--r--  src/runtime/CL/functions/CLFillBorder.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLFullyConnectedLayer.cpp  343
-rw-r--r--  src/runtime/CL/functions/CLGEMM.cpp  145
-rw-r--r--  src/runtime/CL/functions/CLGEMMInterleave4x4.cpp  36
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowp.cpp  85
-rw-r--r--  src/runtime/CL/functions/CLGaussian3x3.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLGaussian5x5.cpp  62
-rw-r--r--  src/runtime/CL/functions/CLGaussianPyramid.cpp  183
-rw-r--r--  src/runtime/CL/functions/CLHOGDescriptor.cpp  99
-rw-r--r--  src/runtime/CL/functions/CLHOGDetector.cpp  69
-rw-r--r--  src/runtime/CL/functions/CLHOGGradient.cpp  75
-rw-r--r--  src/runtime/CL/functions/CLHOGMultiDetection.cpp  240
-rw-r--r--  src/runtime/CL/functions/CLHarrisCorners.cpp  157
-rw-r--r--  src/runtime/CL/functions/CLHistogram.cpp  45
-rw-r--r--  src/runtime/CL/functions/CLIntegralImage.cpp  46
-rw-r--r--  src/runtime/CL/functions/CLLaplacianPyramid.cpp  99
-rw-r--r--  src/runtime/CL/functions/CLLaplacianReconstruct.cpp  99
-rw-r--r--  src/runtime/CL/functions/CLLocallyConnectedLayer.cpp  131
-rw-r--r--  src/runtime/CL/functions/CLMagnitude.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLMeanStdDev.cpp  53
-rw-r--r--  src/runtime/CL/functions/CLMedian3x3.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLMinMaxLocation.cpp  98
-rw-r--r--  src/runtime/CL/functions/CLNonLinearFilter.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp  47
-rw-r--r--  src/runtime/CL/functions/CLNormalizationLayer.cpp  60
-rw-r--r--  src/runtime/CL/functions/CLOpticalFlow.cpp  150
-rw-r--r--  src/runtime/CL/functions/CLPhase.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLPixelWiseMultiplication.cpp  39
-rw-r--r--  src/runtime/CL/functions/CLPoolingLayer.cpp  41
-rw-r--r--  src/runtime/CL/functions/CLRemap.cpp  50
-rw-r--r--  src/runtime/CL/functions/CLScale.cpp  45
-rw-r--r--  src/runtime/CL/functions/CLScharr3x3.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLSobel3x3.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLSobel5x5.cpp  81
-rw-r--r--  src/runtime/CL/functions/CLSobel7x7.cpp  81
-rw-r--r--  src/runtime/CL/functions/CLSoftmaxLayer.cpp  67
-rw-r--r--  src/runtime/CL/functions/CLTableLookup.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLThreshold.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLTranspose.cpp  38
-rw-r--r--  src/runtime/CL/functions/CLWarpAffine.cpp  40
-rw-r--r--  src/runtime/CL/functions/CLWarpPerspective.cpp  40
-rw-r--r--  src/runtime/CPP/CPPScheduler.cpp  225
-rw-r--r--  src/runtime/CPP/SingleThreadScheduler.cpp  52
-rw-r--r--  src/runtime/Distribution1D.cpp  42
-rw-r--r--  src/runtime/HOG.cpp  51
-rw-r--r--  src/runtime/ILutAllocator.cpp  58
-rw-r--r--  src/runtime/ITensorAllocator.cpp  51
-rw-r--r--  src/runtime/Lut.cpp  75
-rw-r--r--  src/runtime/LutAllocator.cpp  52
-rw-r--r--  src/runtime/MultiHOG.cpp  52
-rw-r--r--  src/runtime/MultiImage.cpp  220
-rw-r--r--  src/runtime/NEON/INESimpleFunction.cpp  39
-rw-r--r--  src/runtime/NEON/functions/NEAbsoluteDifference.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEAccumulate.cpp  61
-rw-r--r--  src/runtime/NEON/functions/NEActivationLayer.cpp  36
-rw-r--r--  src/runtime/NEON/functions/NEArithmeticAddition.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEArithmeticSubtraction.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp  49
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseAnd.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseNot.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseOr.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseXor.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEBox3x3.cpp  49
-rw-r--r--  src/runtime/NEON/functions/NECannyEdge.cpp  169
-rw-r--r--  src/runtime/NEON/functions/NEChannelCombine.cpp  45
-rw-r--r--  src/runtime/NEON/functions/NEChannelExtract.cpp  45
-rw-r--r--  src/runtime/NEON/functions/NEColorConvert.cpp  59
-rw-r--r--  src/runtime/NEON/functions/NEConvolution.cpp  120
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp  246
-rw-r--r--  src/runtime/NEON/functions/NEDepthConcatenate.cpp  67
-rw-r--r--  src/runtime/NEON/functions/NEDepthConvert.cpp  44
-rw-r--r--  src/runtime/NEON/functions/NEDerivative.cpp  52
-rw-r--r--  src/runtime/NEON/functions/NEDilate.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp  75
-rw-r--r--  src/runtime/NEON/functions/NEEqualizeHistogram.cpp  62
-rw-r--r--  src/runtime/NEON/functions/NEErode.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NEFastCorners.cpp  101
-rw-r--r--  src/runtime/NEON/functions/NEFillBorder.cpp  39
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp  344
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp  156
-rw-r--r--  src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp  36
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowp.cpp  84
-rw-r--r--  src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NEGaussian3x3.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NEGaussian5x5.cpp  60
-rw-r--r--  src/runtime/NEON/functions/NEGaussianPyramid.cpp  183
-rw-r--r--  src/runtime/NEON/functions/NEHOGDescriptor.cpp  99
-rw-r--r--  src/runtime/NEON/functions/NEHOGDetector.cpp  36
-rw-r--r--  src/runtime/NEON/functions/NEHOGGradient.cpp  80
-rw-r--r--  src/runtime/NEON/functions/NEHOGMultiDetection.cpp  231
-rw-r--r--  src/runtime/NEON/functions/NEHarrisCorners.cpp  212
-rw-r--r--  src/runtime/NEON/functions/NEHistogram.cpp  58
-rw-r--r--  src/runtime/NEON/functions/NEIntegralImage.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NELaplacianPyramid.cpp  102
-rw-r--r--  src/runtime/NEON/functions/NELaplacianReconstruct.cpp  100
-rw-r--r--  src/runtime/NEON/functions/NELocallyConnectedLayer.cpp  131
-rw-r--r--  src/runtime/NEON/functions/NEMagnitude.cpp  48
-rw-r--r--  src/runtime/NEON/functions/NEMeanStdDev.cpp  47
-rw-r--r--  src/runtime/NEON/functions/NEMedian3x3.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NEMinMaxLocation.cpp  50
-rw-r--r--  src/runtime/NEON/functions/NENonLinearFilter.cpp  42
-rw-r--r--  src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp  47
-rw-r--r--  src/runtime/NEON/functions/NENormalizationLayer.cpp  61
-rw-r--r--  src/runtime/NEON/functions/NEOpticalFlow.cpp  119
-rw-r--r--  src/runtime/NEON/functions/NEPhase.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEPoolingLayer.cpp  41
-rw-r--r--  src/runtime/NEON/functions/NERemap.cpp  53
-rw-r--r--  src/runtime/NEON/functions/NEScale.cpp  171
-rw-r--r--  src/runtime/NEON/functions/NEScharr3x3.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NESobel3x3.cpp  40
-rw-r--r--  src/runtime/NEON/functions/NESobel5x5.cpp  81
-rw-r--r--  src/runtime/NEON/functions/NESobel7x7.cpp  81
-rw-r--r--  src/runtime/NEON/functions/NESoftmaxLayer.cpp  72
-rw-r--r--  src/runtime/NEON/functions/NETableLookup.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEThreshold.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NETranspose.cpp  38
-rw-r--r--  src/runtime/NEON/functions/NEWarpAffine.cpp  62
-rw-r--r--  src/runtime/NEON/functions/NEWarpPerspective.cpp  62
-rw-r--r--  src/runtime/OMP/OMPScheduler.cpp  83
-rw-r--r--  src/runtime/Pyramid.cpp  120
-rw-r--r--  src/runtime/Scheduler.cpp  149
-rw-r--r--  src/runtime/SubTensor.cpp  57
-rw-r--r--  src/runtime/Tensor.cpp  51
-rw-r--r--  src/runtime/TensorAllocator.cpp  119
-rw-r--r--  src/runtime/Utils.cpp  42
-rw-r--r--  tests/CL/CLAccessor.h  136
-rw-r--r--  tests/CL/Helper.h  76
-rw-r--r--  tests/CMakeLists.txt  85
-rw-r--r--  tests/Globals.h  38
-rw-r--r--  tests/IAccessor.h  89
-rw-r--r--  tests/NEON/Helper.h  77
-rw-r--r--  tests/NEON/NEAccessor.h  124
-rw-r--r--  tests/ProgramOptions.cpp  88
-rw-r--r--  tests/ProgramOptions.h  101
-rw-r--r--  tests/RawTensor.cpp  180
-rw-r--r--  tests/RawTensor.h  159
-rw-r--r--  tests/SConscript  150
-rw-r--r--  tests/TensorCache.h  118
-rw-r--r--  tests/TensorLibrary.cpp  475
-rw-r--r--  tests/TensorLibrary.h  656
-rw-r--r--  tests/TypePrinter.h  403
-rw-r--r--  tests/TypeReader.h  67
-rw-r--r--  tests/Types.h  37
-rw-r--r--  tests/UserConfiguration.cpp  55
-rw-r--r--  tests/UserConfiguration.h  136
-rw-r--r--  tests/Utils.h  672
-rw-r--r--  tests/benchmark/CL/ActivationLayer.cpp  212
-rw-r--r--  tests/benchmark/CL/BitwiseAnd.cpp  133
-rw-r--r--  tests/benchmark/CL/CMakeLists.txt  57
-rw-r--r--  tests/benchmark/CL/ConvolutionLayer.cpp  277
-rw-r--r--  tests/benchmark/CL/FullyConnectedLayer.cpp  116
-rw-r--r--  tests/benchmark/CL/GEMM.cpp  492
-rw-r--r--  tests/benchmark/CL/GEMM.h  102
-rw-r--r--  tests/benchmark/CL/NormalizationLayer.cpp  93
-rw-r--r--  tests/benchmark/CL/PoolingLayer.cpp  141
-rw-r--r--  tests/benchmark/CMakeLists.txt  100
-rw-r--r--  tests/benchmark/Datasets.h  79
-rw-r--r--  tests/benchmark/Instrument.h  107
-rw-r--r--  tests/benchmark/NEON/ActivationLayer.cpp  239
-rw-r--r--  tests/benchmark/NEON/BitwiseAnd.cpp  126
-rw-r--r--  tests/benchmark/NEON/CMakeLists.txt  37
-rw-r--r--  tests/benchmark/NEON/ConvolutionLayer.cpp  303
-rw-r--r--  tests/benchmark/NEON/ConvolutionLayerDirect.cpp  74
-rw-r--r--  tests/benchmark/NEON/FullyConnectedLayer.cpp  132
-rw-r--r--  tests/benchmark/NEON/GEMM.cpp  709
-rw-r--r--  tests/benchmark/NEON/GEMM.h  106
-rw-r--r--  tests/benchmark/NEON/NormalizationLayer.cpp  111
-rw-r--r--  tests/benchmark/NEON/PoolingLayer.cpp  162
-rw-r--r--  tests/benchmark/PMUCounter.cpp  144
-rw-r--r--  tests/benchmark/PMUCounter.h  71
-rw-r--r--  tests/benchmark/PerformanceProgramOptions.cpp  48
-rw-r--r--  tests/benchmark/PerformanceProgramOptions.h  45
-rw-r--r--  tests/benchmark/PerformanceUserConfiguration.cpp  45
-rw-r--r--  tests/benchmark/PerformanceUserConfiguration.h  57
-rw-r--r--  tests/benchmark/Profiler.cpp  87
-rw-r--r--  tests/benchmark/Profiler.h  76
-rw-r--r--  tests/benchmark/WallClockTimer.cpp  56
-rw-r--r--  tests/benchmark/WallClockTimer.h  53
-rw-r--r--  tests/benchmark/common/ActivationLayer.h  92
-rw-r--r--  tests/benchmark/common/ConvolutionLayer.h  107
-rw-r--r--  tests/benchmark/common/FullyConnectedLayer.h  108
-rw-r--r--  tests/benchmark/common/NormalizationLayer.h  96
-rw-r--r--  tests/benchmark/common/PoolingLayer.h  95
-rw-r--r--  tests/benchmark/main.cpp  96
-rw-r--r--  tests/benchmark/system_tests/CL/AlexNet.cpp  87
-rw-r--r--  tests/benchmark/system_tests/CL/LeNet5.cpp  82
-rw-r--r--  tests/benchmark/system_tests/NEON/AlexNet.cpp  120
-rw-r--r--  tests/benchmark/system_tests/NEON/LeNet5.cpp  80
-rw-r--r--  tests/benchmark/system_tests/common/AlexNet.h  95
-rw-r--r--  tests/benchmark/system_tests/common/LeNet5.h  82
-rw-r--r--  tests/boost_wrapper.h  40
-rw-r--r--  tests/dataset/ActivationFunctionDataset.h  66
-rw-r--r--  tests/dataset/ActivationLayerDataset.h  177
-rw-r--r--  tests/dataset/BatchNormalizationLayerDataset.h  90
-rw-r--r--  tests/dataset/BorderModeDataset.h  82
-rw-r--r--  tests/dataset/ConvertPolicyDataset.h  82
-rw-r--r--  tests/dataset/ConvolutionLayerDataset.h  269
-rw-r--r--  tests/dataset/DataTypeDatasets.h  193
-rw-r--r--  tests/dataset/FullyConnectedLayerDataset.h  155
-rw-r--r--  tests/dataset/GEMMDataset.h  204
-rw-r--r--  tests/dataset/GenericDataset.h  97
-rw-r--r--  tests/dataset/ImageDatasets.h  120
-rw-r--r--  tests/dataset/InterpolationPolicyDataset.h  80
-rw-r--r--  tests/dataset/NormalizationLayerDataset.h  99
-rw-r--r--  tests/dataset/NormalizationTypeDataset.h  80
-rw-r--r--  tests/dataset/PoolingLayerDataset.h  159
-rw-r--r--  tests/dataset/RoundingPolicyDataset.h  82
-rw-r--r--  tests/dataset/ShapeDatasets.h  130
-rw-r--r--  tests/dataset/ThresholdDataset.h  95
-rw-r--r--  tests/model_objects/AlexNet.h  582
-rw-r--r--  tests/model_objects/LeNet5.h  277
-rw-r--r--  tests/validation/CL/BitwiseAnd.cpp  218
-rw-r--r--  tests/validation/CL/CLFixture.cpp  33
-rw-r--r--  tests/validation/CL/CLFixture.h  48
-rw-r--r--  tests/validation/CL/CMakeLists.txt  48
-rw-r--r--  tests/validation/CL/DepthConvert.cpp  413
-rw-r--r--  tests/validation/CL/FillBorder.cpp  91
-rw-r--r--  tests/validation/CL/Threshold.cpp  154
-rw-r--r--  tests/validation/CMakeLists.txt  96
-rw-r--r--  tests/validation/Datasets.h  238
-rw-r--r--  tests/validation/FixedPoint.h  975
-rw-r--r--  tests/validation/Helpers.h  123
-rw-r--r--  tests/validation/NEON/AbsoluteDifference.cpp  201
-rw-r--r--  tests/validation/NEON/Accumulate.cpp  146
-rw-r--r--  tests/validation/NEON/AccumulateSquared.cpp  147
-rw-r--r--  tests/validation/NEON/AccumulateWeighted.cpp  146
-rw-r--r--  tests/validation/NEON/ActivationLayer.cpp  217
-rw-r--r--  tests/validation/NEON/ArithmeticAddition.cpp  228
-rw-r--r--  tests/validation/NEON/ArithmeticSubtraction.cpp  228
-rw-r--r--  tests/validation/NEON/BatchNormalizationLayer.cpp  195
-rw-r--r--  tests/validation/NEON/BitwiseAnd.cpp  218
-rw-r--r--  tests/validation/NEON/BitwiseNot.cpp  142
-rw-r--r--  tests/validation/NEON/BitwiseOr.cpp  150
-rw-r--r--  tests/validation/NEON/BitwiseXor.cpp  150
-rw-r--r--  tests/validation/NEON/Box3x3.cpp  145
-rw-r--r--  tests/validation/NEON/CMakeLists.txt  55
-rw-r--r--  tests/validation/NEON/ConvolutionLayer.cpp  200
-rw-r--r--  tests/validation/NEON/ConvolutionLayerDirect.cpp  219
-rw-r--r--  tests/validation/NEON/DepthConvert.cpp  500
-rw-r--r--  tests/validation/NEON/FillBorder.cpp  90
-rw-r--r--  tests/validation/NEON/Fixedpoint/Exp_QS8.cpp  124
-rw-r--r--  tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp  123
-rw-r--r--  tests/validation/NEON/Fixedpoint/Log_QS8.cpp  123
-rw-r--r--  tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp  123
-rw-r--r--  tests/validation/NEON/FullyConnectedLayer.cpp  221
-rw-r--r--  tests/validation/NEON/GEMM.cpp  203
-rw-r--r--  tests/validation/NEON/IntegralImage.cpp  145
-rw-r--r--  tests/validation/NEON/NormalizationLayer.cpp  152
-rw-r--r--  tests/validation/NEON/PixelWiseMultiplication.cpp  428
-rw-r--r--  tests/validation/NEON/Pooling/PoolingLayer.cpp  139
-rw-r--r--  tests/validation/NEON/SoftmaxLayer.cpp  196
-rw-r--r--  tests/validation/NEON/Threshold.cpp  154
-rw-r--r--  tests/validation/Reference.cpp  596
-rw-r--r--  tests/validation/Reference.h  303
-rw-r--r--  tests/validation/ReferenceCPP.cpp  282
-rw-r--r--  tests/validation/ReferenceCPP.h  250
-rw-r--r--  tests/validation/Tensor.h  111
-rw-r--r--  tests/validation/TensorFactory.h  113
-rw-r--r--  tests/validation/TensorOperations.h  1370
-rw-r--r--  tests/validation/TensorVisitors.h  386
-rw-r--r--  tests/validation/UNIT/CMakeLists.txt  37
-rw-r--r--  tests/validation/UNIT/FixedPoint.cpp  163
-rw-r--r--  tests/validation/UNIT/TensorInfo.cpp  91
-rw-r--r--  tests/validation/UNIT/TensorShape.cpp  70
-rw-r--r--  tests/validation/UNIT/Utils.cpp  95
-rw-r--r--  tests/validation/Validation.cpp  359
-rw-r--r--  tests/validation/Validation.h  127
-rw-r--r--  tests/validation/ValidationProgramOptions.cpp  50
-rw-r--r--  tests/validation/ValidationProgramOptions.h  45
-rw-r--r--  tests/validation/ValidationUserConfiguration.h  42
-rw-r--r--  tests/validation/main.cpp  104
-rw-r--r--  tests/validation/system_tests/CL/AlexNet.cpp  111
-rw-r--r--  tests/validation/system_tests/CL/LeNet5.cpp  94
-rw-r--r--  tests/validation/system_tests/NEON/AlexNet.cpp  112
-rw-r--r--  tests/validation/system_tests/NEON/LeNet5.cpp  94
-rw-r--r--  utils/Utils.cpp  171
-rw-r--r--  utils/Utils.h  325
942 files changed, 157278 insertions, 0 deletions
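The runtime functions listed above (NE* for NEON, CL* for OpenCL) all share the configure-then-run pattern described in docs/01_library.dox from this tree: describe the tensors, configure the function, allocate memory, then run. A minimal sketch of that pattern, assuming the NEGaussian3x3 interface declared in arm_compute/runtime/NEON/functions/NEGaussian3x3.h (the image size and border mode below are illustrative, not taken from this commit):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src;
        Tensor dst;

        // Describe two 640x480 single-channel U8 images; no memory is allocated yet.
        src.allocator()->init(TensorInfo(640, 480, Format::U8));
        dst.allocator()->init(TensorInfo(640, 480, Format::U8));

        // Configure first: the function validates the tensors and sets up its kernel.
        NEGaussian3x3 gauss;
        gauss.configure(&src, &dst, BorderMode::UNDEFINED);

        // Allocate backing memory only after configuration, once padding requirements are known.
        src.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src with image data, then execute the kernel ...
        gauss.run();
        return 0;
    }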
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..01a1f0eadc
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,48 @@
+---
+Language: Cpp
+AccessModifierOffset: '0'
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: 'true'
+AlignConsecutiveDeclarations: 'true'
+AlignEscapedNewlinesLeft: 'true'
+AlignTrailingComments: 'true'
+AllowShortBlocksOnASingleLine: 'false'
+AllowShortCaseLabelsOnASingleLine: 'false'
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: 'false'
+AllowShortLoopsOnASingleLine: 'false'
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: 'true'
+AlwaysBreakTemplateDeclarations: 'true'
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeBraces: Allman
+BreakBeforeTernaryOperators: 'false'
+BreakConstructorInitializersBeforeComma: 'false'
+#BreakStringLiterals: 'true'
+ConstructorInitializerAllOnOneLineOrOnePerLine: 'true'
+Cpp11BracedListStyle: 'false'
+DerivePointerAlignment: 'false'
+IndentCaseLabels: 'true'
+IndentWidth: '4'
+IndentWrappedFunctionNames: 'false'
+KeepEmptyLinesAtTheStartOfBlocks: 'false'
+MaxEmptyLinesToKeep: '1'
+NamespaceIndentation: None
+PointerAlignment: Right
+SortIncludes: 'true'
+SpaceAfterCStyleCast: 'false'
+SpaceBeforeAssignmentOperators: 'true'
+SpaceBeforeParens: Never
+SpaceInEmptyParentheses: 'false'
+SpacesInAngles: 'false'
+SpacesInCStyleCastParentheses: 'false'
+SpacesInParentheses: 'false'
+SpacesInSquareBrackets: 'false'
+Standard: Cpp11
+TabWidth: '4'
+UseTab: Never
+ReflowComments: 'false'
+ContinuationIndentWidth: '4'
+ColumnLimit: 0
+---
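Taken together, these options produce Allman braces, column-aligned consecutive assignments and declarations, right-aligned pointers, four-space indentation and no space before parentheses. A small hand-written illustration of the resulting style (the names here are hypothetical, not from this tree):

    #include "arm_compute/core/ITensor.h"

    using namespace arm_compute;

    ITensor *select_output(ITensor *input, ITensor *fallback)
    {
        const int   width = 640;      // AlignConsecutiveDeclarations pads the type columns,
        const float scale = 0.5f;     // AlignConsecutiveAssignments pads the '='.
        ITensor    *out   = fallback; // PointerAlignment: Right binds '*' to the name.

        if(input != nullptr && width > 0 && scale > 0.f) // SpaceBeforeParens: Never
        {
            out = input; // BreakBeforeBraces: Allman puts '{' on its own line.
        }
        return out;
    }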
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000000..7ee4757259
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,410 @@
+---
+Checks: 'clang-diagnostic-*,clang-analyzer-*,*,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-constant-array-index,-cert-err58-cpp,-cppcoreguidelines-pro-type-reinterpret-cast,-google-runtime-references,-google-build-using-namespace,-readability-redundant-member-init,-readability-redundant-declaration,-readability-else-after-return,-performance-type-promotion-in-math-fn,-cert-err60-cpp'
+WarningsAsErrors: ''
+HeaderFilterRegex: ''
+AnalyzeTemporaryDtors: false
+CheckOptions:
+ - key: cert-dcl59-cpp.HeaderFileExtensions
+ value: h,hh,hpp,hxx
+ - key: cert-err09-cpp.CheckThrowTemporaries
+ value: '1'
+ - key: cert-err61-cpp.CheckThrowTemporaries
+ value: '1'
+ - key: cert-oop11-cpp.IncludeStyle
+ value: llvm
+ - key: cppcoreguidelines-pro-bounds-constant-array-index.GslHeader
+ value: ''
+ - key: cppcoreguidelines-pro-bounds-constant-array-index.IncludeStyle
+ value: '0'
+ - key: cppcoreguidelines-pro-type-member-init.IgnoreArrays
+ value: '0'
+ - key: google-build-namespaces.HeaderFileExtensions
+ value: h,hh,hpp,hxx
+ - key: google-global-names-in-headers.HeaderFileExtensions
+ value: h
+ - key: google-readability-braces-around-statements.ShortStatementLines
+ value: '1'
+ - key: google-readability-function-size.BranchThreshold
+ value: '4294967295'
+ - key: google-readability-function-size.LineThreshold
+ value: '4294967295'
+ - key: google-readability-function-size.StatementThreshold
+ value: '800'
+ - key: google-readability-namespace-comments.ShortNamespaceLines
+ value: '10'
+ - key: google-readability-namespace-comments.SpacesBeforeComments
+ value: '2'
+ - key: google-runtime-int.SignedTypePrefix
+ value: int
+ - key: google-runtime-int.TypeSuffix
+ value: ''
+ - key: google-runtime-int.UnsignedTypePrefix
+ value: uint
+ - key: google-runtime-references.WhiteListTypes
+ value: ''
+ - key: llvm-header-guard.HeaderFileExtensions
+ value: ',h,hh,hpp,hxx'
+ - key: llvm-namespace-comment.ShortNamespaceLines
+ value: '1'
+ - key: llvm-namespace-comment.SpacesBeforeComments
+ value: '1'
+ - key: misc-argument-comment.StrictMode
+ value: '0'
+ - key: misc-assert-side-effect.AssertMacros
+ value: assert
+ - key: misc-assert-side-effect.CheckFunctionCalls
+ value: '0'
+ - key: misc-dangling-handle.HandleClasses
+ value: 'std::basic_string_view;std::experimental::basic_string_view'
+ - key: misc-definitions-in-headers.HeaderFileExtensions
+ value: ',h,hh,hpp,hxx'
+ - key: misc-definitions-in-headers.UseHeaderFileExtension
+ value: '1'
+ - key: misc-misplaced-widening-cast.CheckImplicitCasts
+ value: '0'
+ - key: misc-move-constructor-init.IncludeStyle
+ value: llvm
+ - key: misc-sizeof-expression.WarnOnSizeOfCompareToConstant
+ value: '1'
+ - key: misc-sizeof-expression.WarnOnSizeOfConstant
+ value: '1'
+ - key: misc-sizeof-expression.WarnOnSizeOfThis
+ value: '1'
+ - key: misc-string-constructor.LargeLengthThreshold
+ value: '8388608'
+ - key: misc-string-constructor.WarnOnLargeLength
+ value: '1'
+ - key: misc-suspicious-enum-usage.StrictMode
+ value: '0'
+ - key: misc-suspicious-missing-comma.MaxConcatenatedTokens
+ value: '5'
+ - key: misc-suspicious-missing-comma.RatioThreshold
+ value: '0.200000'
+ - key: misc-suspicious-missing-comma.SizeThreshold
+ value: '5'
+ - key: misc-suspicious-string-compare.StringCompareLikeFunctions
+ value: ''
+ - key: misc-suspicious-string-compare.WarnOnImplicitComparison
+ value: '1'
+ - key: misc-suspicious-string-compare.WarnOnLogicalNotComparison
+ value: '0'
+ - key: misc-throw-by-value-catch-by-reference.CheckThrowTemporaries
+ value: '1'
+ - key: modernize-loop-convert.MaxCopySize
+ value: '16'
+ - key: modernize-loop-convert.MinConfidence
+ value: reasonable
+ - key: modernize-loop-convert.NamingStyle
+ value: CamelCase
+ - key: modernize-pass-by-value.IncludeStyle
+ value: llvm
+ - key: modernize-pass-by-value.ValuesOnly
+ value: '0'
+ - key: modernize-replace-auto-ptr.IncludeStyle
+ value: llvm
+ - key: modernize-use-auto.RemoveStars
+ value: '0'
+ - key: modernize-use-default-member-init.UseAssignment
+ value: '0'
+ - key: modernize-use-emplace.ContainersWithPushBack
+ value: '::std::vector;::std::list;::std::deque'
+ - key: modernize-use-emplace.SmartPointers
+ value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr'
+ - key: modernize-use-nullptr.NullMacros
+ value: 'NULL'
+ - key: modernize-use-transparent-functors.SafeMode
+ value: '0'
+ - key: performance-faster-string-find.StringLikeClasses
+ value: 'std::basic_string'
+ - key: performance-for-range-copy.WarnOnAllAutoCopies
+ value: '0'
+ - key: performance-inefficient-string-concatenation.StrictMode
+ value: '0'
+ - key: performance-type-promotion-in-math-fn.IncludeStyle
+ value: llvm
+ - key: performance-unnecessary-value-param.IncludeStyle
+ value: llvm
+ - key: readability-braces-around-statements.ShortStatementLines
+ value: '0'
+ - key: readability-function-size.BranchThreshold
+ value: '4294967295'
+ - key: readability-function-size.LineThreshold
+ value: '4294967295'
+ - key: readability-function-size.StatementThreshold
+ value: '800'
+ - key: readability-identifier-naming.AbstractClassCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.AbstractClassPrefix
+ value: ''
+ - key: readability-identifier-naming.AbstractClassSuffix
+ value: ''
+ - key: readability-identifier-naming.ClassCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ClassConstantCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ClassConstantPrefix
+ value: ''
+ - key: readability-identifier-naming.ClassConstantSuffix
+ value: ''
+ - key: readability-identifier-naming.ClassMemberCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ClassMemberPrefix
+ value: ''
+ - key: readability-identifier-naming.ClassMemberSuffix
+ value: ''
+ - key: readability-identifier-naming.ClassMethodCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ClassMethodPrefix
+ value: ''
+ - key: readability-identifier-naming.ClassMethodSuffix
+ value: ''
+ - key: readability-identifier-naming.ClassPrefix
+ value: ''
+ - key: readability-identifier-naming.ClassSuffix
+ value: ''
+ - key: readability-identifier-naming.ConstantCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ConstantMemberCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ConstantMemberPrefix
+ value: ''
+ - key: readability-identifier-naming.ConstantMemberSuffix
+ value: ''
+ - key: readability-identifier-naming.ConstantParameterCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ConstantParameterPrefix
+ value: ''
+ - key: readability-identifier-naming.ConstantParameterSuffix
+ value: ''
+ - key: readability-identifier-naming.ConstantPrefix
+ value: ''
+ - key: readability-identifier-naming.ConstantSuffix
+ value: ''
+ - key: readability-identifier-naming.ConstexprFunctionCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ConstexprFunctionPrefix
+ value: ''
+ - key: readability-identifier-naming.ConstexprFunctionSuffix
+ value: ''
+ - key: readability-identifier-naming.ConstexprMethodCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ConstexprMethodPrefix
+ value: ''
+ - key: readability-identifier-naming.ConstexprMethodSuffix
+ value: ''
+ - key: readability-identifier-naming.ConstexprVariableCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ConstexprVariablePrefix
+ value: ''
+ - key: readability-identifier-naming.ConstexprVariableSuffix
+ value: ''
+ - key: readability-identifier-naming.EnumCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.EnumConstantCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.EnumConstantPrefix
+ value: ''
+ - key: readability-identifier-naming.EnumConstantSuffix
+ value: ''
+ - key: readability-identifier-naming.EnumPrefix
+ value: ''
+ - key: readability-identifier-naming.EnumSuffix
+ value: ''
+ - key: readability-identifier-naming.FunctionCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.FunctionPrefix
+ value: ''
+ - key: readability-identifier-naming.FunctionSuffix
+ value: ''
+ - key: readability-identifier-naming.GlobalConstantCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.GlobalConstantPrefix
+ value: ''
+ - key: readability-identifier-naming.GlobalConstantSuffix
+ value: ''
+ - key: readability-identifier-naming.GlobalFunctionCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.GlobalFunctionPrefix
+ value: ''
+ - key: readability-identifier-naming.GlobalFunctionSuffix
+ value: ''
+ - key: readability-identifier-naming.GlobalVariableCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.GlobalVariablePrefix
+ value: ''
+ - key: readability-identifier-naming.GlobalVariableSuffix
+ value: ''
+ - key: readability-identifier-naming.IgnoreFailedSplit
+ value: '0'
+ - key: readability-identifier-naming.InlineNamespaceCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.InlineNamespacePrefix
+ value: ''
+ - key: readability-identifier-naming.InlineNamespaceSuffix
+ value: ''
+ - key: readability-identifier-naming.LocalConstantCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.LocalConstantPrefix
+ value: ''
+ - key: readability-identifier-naming.LocalConstantSuffix
+ value: ''
+ - key: readability-identifier-naming.LocalVariableCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.LocalVariablePrefix
+ value: ''
+ - key: readability-identifier-naming.LocalVariableSuffix
+ value: ''
+ - key: readability-identifier-naming.MacroDefinitionCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.MacroDefinitionPrefix
+ value: ''
+ - key: readability-identifier-naming.MacroDefinitionSuffix
+ value: ''
+ - key: readability-identifier-naming.MemberCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.MemberPrefix
+ value: ''
+ - key: readability-identifier-naming.MemberSuffix
+ value: ''
+ - key: readability-identifier-naming.MethodCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.MethodPrefix
+ value: ''
+ - key: readability-identifier-naming.MethodSuffix
+ value: ''
+ - key: readability-identifier-naming.NamespaceCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.NamespacePrefix
+ value: ''
+ - key: readability-identifier-naming.NamespaceSuffix
+ value: ''
+ - key: readability-identifier-naming.ParameterCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ParameterPackCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ParameterPackPrefix
+ value: ''
+ - key: readability-identifier-naming.ParameterPackSuffix
+ value: ''
+ - key: readability-identifier-naming.ParameterPrefix
+ value: ''
+ - key: readability-identifier-naming.ParameterSuffix
+ value: ''
+ - key: readability-identifier-naming.PrivateMemberCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.PrivateMemberPrefix
+ value: ''
+ - key: readability-identifier-naming.PrivateMemberSuffix
+ value: ''
+ - key: readability-identifier-naming.PrivateMethodCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.PrivateMethodPrefix
+ value: ''
+ - key: readability-identifier-naming.PrivateMethodSuffix
+ value: ''
+ - key: readability-identifier-naming.ProtectedMemberCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ProtectedMemberPrefix
+ value: ''
+ - key: readability-identifier-naming.ProtectedMemberSuffix
+ value: ''
+ - key: readability-identifier-naming.ProtectedMethodCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ProtectedMethodPrefix
+ value: ''
+ - key: readability-identifier-naming.ProtectedMethodSuffix
+ value: ''
+ - key: readability-identifier-naming.PublicMemberCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.PublicMemberPrefix
+ value: ''
+ - key: readability-identifier-naming.PublicMemberSuffix
+ value: ''
+ - key: readability-identifier-naming.PublicMethodCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.PublicMethodPrefix
+ value: ''
+ - key: readability-identifier-naming.PublicMethodSuffix
+ value: ''
+ - key: readability-identifier-naming.StaticConstantCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.StaticConstantPrefix
+ value: ''
+ - key: readability-identifier-naming.StaticConstantSuffix
+ value: ''
+ - key: readability-identifier-naming.StaticVariableCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.StaticVariablePrefix
+ value: ''
+ - key: readability-identifier-naming.StaticVariableSuffix
+ value: ''
+ - key: readability-identifier-naming.StructCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.StructPrefix
+ value: ''
+ - key: readability-identifier-naming.StructSuffix
+ value: ''
+ - key: readability-identifier-naming.TemplateParameterCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.TemplateParameterPrefix
+ value: ''
+ - key: readability-identifier-naming.TemplateParameterSuffix
+ value: ''
+ - key: readability-identifier-naming.TemplateTemplateParameterCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.TemplateTemplateParameterPrefix
+ value: ''
+ - key: readability-identifier-naming.TemplateTemplateParameterSuffix
+ value: ''
+ - key: readability-identifier-naming.TypeAliasCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.TypeAliasPrefix
+ value: ''
+ - key: readability-identifier-naming.TypeAliasSuffix
+ value: ''
+ - key: readability-identifier-naming.TypeTemplateParameterCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.TypeTemplateParameterPrefix
+ value: ''
+ - key: readability-identifier-naming.TypeTemplateParameterSuffix
+ value: ''
+ - key: readability-identifier-naming.TypedefCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.TypedefPrefix
+ value: ''
+ - key: readability-identifier-naming.TypedefSuffix
+ value: ''
+ - key: readability-identifier-naming.UnionCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.UnionPrefix
+ value: ''
+ - key: readability-identifier-naming.UnionSuffix
+ value: ''
+ - key: readability-identifier-naming.ValueTemplateParameterCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.ValueTemplateParameterPrefix
+ value: ''
+ - key: readability-identifier-naming.ValueTemplateParameterSuffix
+ value: ''
+ - key: readability-identifier-naming.VariableCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.VariablePrefix
+ value: ''
+ - key: readability-identifier-naming.VariableSuffix
+ value: ''
+ - key: readability-identifier-naming.VirtualMethodCase
+ value: aNy_CasE
+ - key: readability-identifier-naming.VirtualMethodPrefix
+ value: ''
+ - key: readability-identifier-naming.VirtualMethodSuffix
+ value: ''
+ - key: readability-implicit-bool-cast.AllowConditionalIntegerCasts
+ value: '0'
+ - key: readability-implicit-bool-cast.AllowConditionalPointerCasts
+ value: '0'
+ - key: readability-simplify-boolean-expr.ChainedConditionalAssignment
+ value: '0'
+ - key: readability-simplify-boolean-expr.ChainedConditionalReturn
+ value: '0'
+...
+
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000..a15c40f0d1
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,8 @@
+[submodule "data"]
+ path = data
+ url = ../data
+ branch = master
+[submodule "3rdparty"]
+ path = 3rdparty
+ url = ../3rdparty
+ branch = master
diff --git a/3rdparty b/3rdparty
new file mode 160000
+Subproject commit e2e08d62647278f63993b43dd84d16484f111a7
diff --git a/SConscript b/SConscript
new file mode 100644
index 0000000000..970466e5ae
--- /dev/null
+++ b/SConscript
@@ -0,0 +1,199 @@
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import collections
+import os.path
+import re
+import subprocess
+
+VERSION = "v0.0-unreleased"
+SONAME_VERSION = "1.0.0"
+
+Import('env')
+Import('vars')
+
+def build_library(name, sources, static=False):
+ if static:
+ obj = arm_compute_env.StaticLibrary(name, source=sources)
+ else:
+ if env['set_soname']:
+ obj = arm_compute_env.SharedLibrary(name, source=sources, SHLIBVERSION = SONAME_VERSION)
+
+ symlinks = []
+ # Manually delete symlinks or SCons will get confused:
+ directory = os.path.dirname(obj[0].path)
+ library_prefix = obj[0].path[:-(1 + len(SONAME_VERSION))]
+ real_lib = "%s.%s" % (library_prefix, SONAME_VERSION)
+
+ for f in Glob("#%s*" % library_prefix):
+ if str(f) != real_lib:
+ symlinks.append("%s/%s" % (directory,str(f)))
+
+ clean = arm_compute_env.Command('clean-%s' % str(obj[0]), [], Delete(symlinks))
+ Default(clean)
+ Depends(obj, clean)
+ else:
+ obj = arm_compute_env.SharedLibrary(name, source=sources)
+
+ Default(obj)
+ return obj
+
+def resolve_includes(target, source, env):
+ # File collection
+ FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents')
+
+ # Include pattern
+ pattern = re.compile("#include \"(.*)\"")
+
+ # Get file contents
+ files = []
+ for i in range(len(source)):
+ src = source[i]
+ dst = target[i]
+ contents = src.get_contents().splitlines()
+ entry = FileEntry(target_name=dst, file_contents=contents)
+ files.append((os.path.basename(src.get_path()),entry))
+
+ # Create dictionary of tupled list
+ files_dict = dict(files)
+
+ # Check for includes (can only be files in the same folder)
+ final_files = []
+ for file in files:
+ done = False
+ tmp_file = file[1].file_contents
+ while not done:
+ file_count = 0
+ updated_file = []
+ for line in tmp_file:
+ found = pattern.search(line)
+ if found:
+ include_file = found.group(1)
+ data = files_dict[include_file].file_contents
+ updated_file.extend(data)
+ else:
+ updated_file.append(line)
+ file_count += 1
+
+            # Check if all includes have been replaced.
+ if file_count == len(tmp_file):
+ done = True
+
+ # Update temp file
+ tmp_file = updated_file
+
+ # Append and prepend string literal identifiers and add expanded file to final list
+ tmp_file.insert(0, "R\"(\n")
+ tmp_file.append("\n)\"")
+ entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file)
+ final_files.append((file[0], entry))
+
+ # Write output files
+ for file in final_files:
+ with open(file[1].target_name.get_path(), 'w+') as out_file:
+ out_file.write( "\n".join( file[1].file_contents ))
+
+def create_version_file(target, source, env):
+    # Generate a string with the build options and library version to embed in the library:
+ try:
+ git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"])
+ except (OSError, subprocess.CalledProcessError):
+ git_hash="unknown"
+
+ version_filename = "%s/arm_compute_version.embed" % Dir("src/core").path
+ build_info = "\"arm_compute_version=%s Build options: %s Git hash=%s\"" % (VERSION, vars.args, git_hash.strip())
+ with open(target[0].get_path(), "w") as fd:
+ fd.write(build_info)
+
+
+arm_compute_env = env.Clone()
+
+generate_embed = [ arm_compute_env.Command("src/core/arm_compute_version.embed", "", action=create_version_file) ]
+arm_compute_env.Append(CPPPATH =[Dir("./src/core/").path] )
+
+if env["os"] not in ["android", "bare_metal"]:
+ arm_compute_env.Append(LIBS = ['pthread'])
+
+arm_compute_env.Append(LIBS = ['dl'])
+
+core_files = Glob('src/core/*.cpp')
+core_files += Glob('src/core/CPP/*.cpp')
+core_files += Glob('src/core/CPP/kernels/*.cpp')
+
+files = Glob('src/runtime/*.cpp')
+# CLHarrisCorners uses the Scheduler to run CPP kernels
+files += Glob('src/runtime/CPP/SingleThreadScheduler.cpp')
+
+if env['cppthreads']:
+ files += Glob('src/runtime/CPP/CPPScheduler.cpp')
+
+if env['openmp']:
+ files += Glob('src/runtime/OMP/OMPScheduler.cpp')
+
+if env['opencl']:
+ core_files += Glob('src/core/CL/*.cpp')
+ core_files += Glob('src/core/CL/kernels/*.cpp')
+
+ files += Glob('src/runtime/CL/*.cpp')
+ files += Glob('src/runtime/CL/functions/*.cpp')
+
+ # Generate embed files
+ if env['embed_kernels']:
+ cl_files = Glob('src/core/CL/cl_kernels/*.cl')
+ cl_files += Glob('src/core/CL/cl_kernels/*.h')
+
+ embed_files = [ f.get_path()+"embed" for f in cl_files ]
+ arm_compute_env.Append(CPPPATH =[Dir("./src/core/CL/").path] )
+
+ generate_embed.append(arm_compute_env.Command(embed_files, cl_files, action=resolve_includes))
+
+if env['neon']:
+ core_files += Glob('src/core/NEON/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/*.cpp')
+
+ files += Glob('src/runtime/NEON/*.cpp')
+ files += Glob('src/runtime/NEON/functions/*.cpp')
+
+static_core_objects = [arm_compute_env.StaticObject(f) for f in core_files]
+shared_core_objects = [arm_compute_env.SharedObject(f) for f in core_files]
+
+arm_compute_core_a = build_library('arm_compute_core-static', static_core_objects, static=True)
+Export('arm_compute_core_a')
+
+if env['os'] != 'bare_metal':
+ arm_compute_core_so = build_library('arm_compute_core', shared_core_objects, static=False)
+ Export('arm_compute_core_so')
+
+shared_objects = [arm_compute_env.SharedObject(f) for f in files]
+static_objects = [arm_compute_env.StaticObject(f) for f in files]
+
+arm_compute_a = build_library('arm_compute-static', static_core_objects + static_objects, static=True)
+Export('arm_compute_a')
+
+if env['os'] != 'bare_metal':
+ arm_compute_so = build_library('arm_compute', shared_core_objects + shared_objects, static=False)
+ Export('arm_compute_so')
+
+# arm_compute_so and arm_compute_core_so only exist for non bare metal builds:
+if env['os'] != 'bare_metal':
+    alias = arm_compute_env.Alias("arm_compute", [arm_compute_a, arm_compute_so])
+    Depends([alias, arm_compute_core_so, arm_compute_core_a], generate_embed)
+else:
+    alias = arm_compute_env.Alias("arm_compute", [arm_compute_a])
+    Depends([alias, arm_compute_core_a], generate_embed)
+Default(alias)
+
+Default(generate_embed)
diff --git a/SConstruct b/SConstruct
new file mode 100644
index 0000000000..3927e3acc9
--- /dev/null
+++ b/SConstruct
@@ -0,0 +1,208 @@
+# Copyright (c) 2016, 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import SCons
+import os
+import subprocess
+
+def version_at_least(version, required):
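+    # Compare two dotted version strings character by character; the step of 2
+    # skips over the '.' separators, so this assumes single-digit version
+    # components (e.g. "6.2.1").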
+ end = min(len(version), len(required))
+
+ for i in range(0, end, 2):
+ if int(version[i]) < int(required[i]):
+ return False
+ elif int(version[i]) > int(required[i]):
+ return True
+
+ return True
+
+vars = Variables("scons")
+vars.AddVariables(
+ BoolVariable("debug", "Debug", False),
+ BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),
+ EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "x86_32", "x86_64")),
+ EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "bare_metal")),
+ EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile")),
+ BoolVariable("examples", "Build example programs", True),
+ BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),
+ BoolVariable("opencl", "Enable OpenCL support", True),
+ BoolVariable("neon", "Enable Neon support", False),
+ BoolVariable("embed_kernels", "Embed OpenCL kernels in library binary", False),
+ BoolVariable("set_soname", "Set the library's soname and shlibversion (requires SCons 2.4 or above)", False),
+ BoolVariable("openmp", "Enable OpenMP backend", False),
+ BoolVariable("cppthreads", "Enable C++11 threads backend", True),
+ PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathAccept),
+ ("extra_cxx_flags", "Extra CXX flags to be appended to the build command", "")
+)
+
+env = Environment(platform="posix", variables=vars, ENV = os.environ)
+
+SConsignFile('build/.%s' % env['build_dir'])
+
+Help(vars.GenerateHelpText(env))
+
+if env['neon'] and 'x86' in env['arch']:
+    print("Cannot compile NEON for x86")
+ Exit(1)
+
+if env['set_soname'] and not version_at_least(SCons.__version__, "2.4"):
+    print("Setting the library's SONAME / SHLIBVERSION requires SCons 2.4 or above")
+    print("Update your version of SCons or use set_soname=0")
+ Exit(1)
+
+if env['os'] == 'bare_metal':
+ if env['cppthreads'] or env['openmp']:
+ print("ERROR: OpenMP and C++11 threads not supported in bare_metal. Use cppthreads=0 openmp=0")
+ Exit(1)
+
+env.Append(CXXFLAGS = ['-Wno-deprecated-declarations','-Wall','-DARCH_ARM',
+ '-Wextra','-Wno-unused-parameter','-pedantic','-Wdisabled-optimization','-Wformat=2',
+ '-Winit-self','-Wstrict-overflow=2','-Wswitch-default',
+ '-fpermissive','-std=gnu++11','-Wno-vla','-Woverloaded-virtual',
+ '-Wctor-dtor-privacy','-Wsign-promo','-Weffc++','-Wno-format-nonliteral','-Wno-overlength-strings','-Wno-strict-overflow'])
+env.Append(CPPDEFINES = ['_GLIBCXX_USE_NANOSLEEP'])
+
+if os.environ.get('CXX', 'g++') == 'clang++':
+ env.Append(CXXFLAGS = ['-Wno-format-nonliteral','-Wno-deprecated-increment-bool','-Wno-vla-extension','-Wno-mismatched-tags'])
+else:
+ env.Append(CXXFLAGS = ['-Wlogical-op','-Wnoexcept','-Wstrict-null-sentinel'])
+
+if env['cppthreads']:
+ env.Append(CPPDEFINES = [('ARM_COMPUTE_CPP_SCHEDULER', 1)])
+
+if env['openmp']:
+ if os.environ.get('CXX', 'g++') == 'clang++':
+        print("Clang does not support OpenMP. Use openmp=0 cppthreads=1 instead.")
+ Exit(1)
+
+ env.Append(CPPDEFINES = [('ARM_COMPUTE_OPENMP_SCHEDULER', 1)])
+ env.Append(CXXFLAGS = ['-fopenmp'])
+ env.Append(LINKFLAGS = ['-fopenmp'])
+
+prefix = ""
+if env['arch'] == 'armv7a':
+ env.Append(CXXFLAGS = ['-march=armv7-a', '-mthumb', '-mfpu=neon'])
+
+ if env['os'] in ['linux', 'bare_metal']:
+ prefix = "arm-linux-gnueabihf-"
+ env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
+ elif env['os'] == 'android':
+ prefix = "arm-linux-androideabi-"
+ env.Append(CXXFLAGS = ['-mfloat-abi=softfp'])
+elif env['arch'] == 'arm64-v8a':
+ env.Append(CXXFLAGS = ['-march=armv8-a'])
+
+ if env['os'] in ['linux', 'bare_metal']:
+ prefix = "aarch64-linux-gnu-"
+ elif env['os'] == 'android':
+ prefix = "aarch64-linux-android-"
+elif env['arch'] == 'arm64-v8.2-a':
+ env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16+simd'])
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_ENABLE_FP16'])
+
+ if env['os'] in ['linux', 'bare_metal']:
+ prefix = "aarch64-linux-gnu-"
+ elif env['os'] == 'android':
+ prefix = "aarch64-linux-android-"
+elif env['arch'] == 'x86_32':
+ env.Append(CCFLAGS = ['-m32'])
+ env.Append(LINKFLAGS = ['-m32'])
+elif env['arch'] == 'x86_64':
+ env.Append(CCFLAGS = ['-m64'])
+ env.Append(LINKFLAGS = ['-m64'])
+
+if env['build'] == 'native':
+ prefix = ""
+
+env['CC'] = prefix + os.environ.get('CC', 'gcc')
+env['CXX'] = prefix + os.environ.get('CXX', 'g++')
+env['LD'] = prefix + "ld"
+env['AS'] = prefix + "as"
+env['AR'] = prefix + "ar"
+env['RANLIB'] = prefix + "ranlib"
+
+if not GetOption("help"):
+ try:
+ compiler_ver = subprocess.check_output([env['CXX'], "-dumpversion"]).strip()
+ except OSError:
+ print("ERROR: Compiler '%s' not found" % env['CXX'])
+ Exit(1)
+
+ if os.environ.get('CXX','g++') == 'g++':
+ if env['arch'] == 'arm64-v8.2-a' and not version_at_least(compiler_ver, '6.2.1'):
+            print("GCC 6.2.1 or newer is required to compile armv8.2-a code")
+ Exit(1)
+ elif env['arch'] == 'arm64-v8a' and not version_at_least(compiler_ver, '4.9'):
+            print("GCC 4.9 or newer is required to compile NEON code for AArch64")
+ Exit(1)
+
+ if version_at_least(compiler_ver, '6.1'):
+ env.Append(CXXFLAGS = ['-Wno-ignored-attributes'])
+
+ if compiler_ver == '4.8.3':
+ env.Append(CXXFLAGS = ['-Wno-array-bounds'])
+
+if env['Werror']:
+ env.Append(CXXFLAGS = ['-Werror'])
+
+if env['os'] == 'android':
+ env.Append(CPPDEFINES = ['ANDROID'])
+ env.Append(LINKFLAGS = ['-pie', '-static-libstdc++'])
+elif env['os'] == 'bare_metal':
+ env.Append(LINKFLAGS = ['-static'])
+ env.Append(CXXFLAGS = ['-fPIC'])
+ env.Append(CPPDEFINES = ['NO_MULTI_THREADING'])
+
+if env['opencl']:
+ if env['os'] == 'bare_metal':
+ print("Cannot link OpenCL statically, which is required on bare metal")
+ Exit(1)
+
+ if env['embed_kernels']:
+ env.Append(CPPDEFINES = ['EMBEDDED_KERNELS'])
+
+if env['debug']:
+ env['asserts'] = True
+ env.Append(CXXFLAGS = ['-O0','-g','-gdwarf-2'])
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_DEBUG_ENABLED'])
+else:
+ env.Append(CXXFLAGS = ['-O3','-ftree-vectorize'])
+
+if env['asserts']:
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_ASSERTS_ENABLED'])
+
+env.Append(CPPPATH = ['#/include', "#"])
+env.Append(CXXFLAGS = env['extra_cxx_flags'])
+
+Export('vars')
+Export('env')
+Export('version_at_least')
+
+SConscript('./SConscript', variant_dir='#build/%s' % env['build_dir'], duplicate=0)
+
+if env['opencl']:
+ SConscript("./opencl-1.2-stubs/SConscript", variant_dir="build/%s/opencl-1.2-stubs" % env['build_dir'], duplicate=0)
+
+if env['examples']:
+ SConscript('./examples/SConscript', variant_dir='#build/%s/examples' % env['build_dir'], duplicate=0)
+
+SConscript('./tests/SConscript', variant_dir='#build/%s/tests' % env['build_dir'], duplicate=0)
diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h
new file mode 100644
index 0000000000..0a3344b115
--- /dev/null
+++ b/arm_compute/core/AccessWindowAutoPadding.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H__
+#define __ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Dummy access window.
+ *
+ * This implementation always uses the auto padding of the tensor info and
+ * never updates the window. The valid region is always set to cover the entire
+ * tensor.
+ *
+ * @note This access window is only used during the migration to the new
+ * padding system. It will be removed once all kernels have been ported.
+ */
+class AccessWindowAutoPadding : public IAccessWindow
+{
+public:
+ /** Default constructor.
+ *
+ * @param[in,out] info Tensor info of the accessed kernel.
+ */
+ AccessWindowAutoPadding(ITensorInfo *info);
+ AccessWindowAutoPadding(const AccessWindowAutoPadding &) = delete;
+ AccessWindowAutoPadding &operator=(const AccessWindowAutoPadding &) = delete;
+ AccessWindowAutoPadding(AccessWindowAutoPadding &&) = default;
+ AccessWindowAutoPadding &operator=(AccessWindowAutoPadding &&) = default;
+ ~AccessWindowAutoPadding() = default;
+
+ /** Set the valid region to match the entire tensor. */
+ void set_valid_region();
+
+ /** Return a valid region that spans across the entire tensor. */
+ ValidRegion compute_valid_region() const;
+
+ // Inherited methods overridden:
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) const override;
+ ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+private:
+ ITensorInfo *_info;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H__*/
diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h
new file mode 100644
index 0000000000..6dcba072c4
--- /dev/null
+++ b/arm_compute/core/AccessWindowStatic.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IACCESS_WINDOW_STATIC_H__
+#define __ARM_COMPUTE_IACCESS_WINDOW_STATIC_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Implementation of a static rectangular access pattern.
+ *
+ * In this implementation the access offsets and sizes are not relative to the
+ * current element. Instead they are considered to be absolute coordinates
+ * within the accessed tensor's shape.
+ */
+class AccessWindowStatic : public IAccessWindow
+{
+public:
+ /** Constructor for a static access pattern.
+ *
+ * @param[in,out] info Tensor info of the accessed kernel.
+ * @param[in] start_x Start of the access in X direction.
+ * @param[in] start_y Start of the access in Y direction.
+ * @param[in] end_x End of the access in X direction.
+ * @param[in] end_y End of the access in Y direction.
+ */
+ AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y);
+
+ AccessWindowStatic(const AccessWindowStatic &) = delete;
+ AccessWindowStatic &operator=(const AccessWindowStatic &) = delete;
+ AccessWindowStatic(AccessWindowStatic &&) = default;
+ AccessWindowStatic &operator=(AccessWindowStatic &&) = default;
+ ~AccessWindowStatic() = default;
+
+ /** Set the valid region based on the static access pattern and valid
+ * region of the inputs.
+ *
+ * @param[in] window Execution window of the kernel.
+ * @param[in] input_valid_region Combined valid region of all inputs.
+ */
+ void set_valid_region(const Window &window, const ValidRegion &input_valid_region);
+
+ /** Compute the valid region based on the static access pattern and valid region of the inputs.
+ *
+ * @param[in] window Execution window of the kernel.
+ * @param[in] input_valid_region Combined valid region of all inputs.
+ */
+ ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
+
+    // Inherited methods overridden:
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) const override;
+ ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+ ITensorInfo *_info;
+ int _start_x;
+ int _start_y;
+ int _end_x;
+ int _end_y;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_IACCESS_WINDOW_STATIC_H__*/
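A minimal sketch of what the absolute coordinates above mean in practice; the tensor-info pointer is assumed to come from a kernel's input, and the 8x8 rectangle is arbitrary:

    #include "arm_compute/core/AccessWindowStatic.h"

    void declare_static_access(arm_compute::ITensorInfo *input_info)
    {
        // Always covers the fixed rectangle [0, 8) x [0, 8) of the tensor,
        // regardless of which element the execution window is processing:
        arm_compute::AccessWindowStatic access(input_info, 0, 0, 8, 8);
    }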
diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/core/AccessWindowTranspose.h
new file mode 100644
index 0000000000..102860f9d8
--- /dev/null
+++ b/arm_compute/core/AccessWindowTranspose.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H__
+#define __ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Implementation of an XY-transpose access pattern. */
+class AccessWindowTranspose : public AccessWindowRectangle
+{
+public:
+ using AccessWindowRectangle::AccessWindowRectangle;
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) const override;
+ using AccessWindowRectangle::compute_valid_region;
+ ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H__*/
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
new file mode 100644
index 0000000000..26253e3f38
--- /dev/null
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHELPERS_H__
+#define __ARM_COMPUTE_CLHELPERS_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <string>
+
+namespace arm_compute
+{
+enum class DataType;
+enum class GPUTarget;
+
+/** Enable bitwise operations on GPUTarget enumerations */
+template <>
+struct enable_bitwise_ops<arm_compute::GPUTarget>
+{
+ static constexpr bool value = true;
+};
+
+/** Max vector width of an OpenCL vector */
+static constexpr const unsigned int max_cl_vector_width = 16;
+
+/** Translates a tensor data type to the appropriate OpenCL type.
+ *
+ * @param[in] dt @ref DataType to be translated to OpenCL type.
+ *
+ * @return The string specifying the OpenCL type to be used.
+ */
+std::string get_cl_type_from_data_type(const DataType &dt);
+
+/** Translates a given gpu device target to string.
+ *
+ * @param[in] target Given gpu target.
+ *
+ * @return The string describing the target.
+ */
+const std::string &string_from_target(GPUTarget target);
+
+/** Helper function to create and return a unique_ptr pointing to a CL kernel object.
+ * It also calls the kernel's configure() method.
+ *
+ * @param[in] args All the arguments that need to be passed to the kernel's configure() method.
+ *
+ * @return A unique pointer pointing to the configured CL kernel object
+ */
+template <typename Kernel, typename... T>
+std::unique_ptr<Kernel> create_configure_kernel(T &&... args)
+{
+ std::unique_ptr<Kernel> k = arm_compute::cpp14::make_unique<Kernel>();
+ k->configure(std::forward<T>(args)...);
+ return k;
+}
+
+/** Helper function to create and return a unique_ptr pointing to a default-constructed CL kernel object
+ *
+ * @return A unique pointer pointing to the newly created CL kernel object
+ */
+template <typename Kernel>
+std::unique_ptr<Kernel> create_kernel()
+{
+ std::unique_ptr<Kernel> k = arm_compute::cpp14::make_unique<Kernel>();
+ return k;
+}
+
+/** Helper function to get the GPU target from CL device
+ *
+ * @param[in] device A CL device
+ *
+ * @return the GPU target
+ */
+GPUTarget get_target_from_device(cl::Device &device);
+
+/** Helper function to get the GPU architecture from a GPU target
+ *
+ * @param[in] target GPU target
+ *
+ * @return The GPU architecture (e.g. GPUTarget::MIDGARD or GPUTarget::BIFROST) the given target belongs to
+ */
+GPUTarget get_arch_from_target(GPUTarget target);
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLHELPERS_H__ */
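The two kernel-creation helpers above differ only in whether configure() is invoked; a minimal sketch of both, assuming a hypothetical kernel type MyCLKernel (neither the type nor its configure() signature comes from this header):

    #include "arm_compute/core/CL/CLHelpers.h"
    #include "arm_compute/core/CL/ICLTensor.h"

    // Hypothetical stand-in kernel, only here to make the sketch self-contained:
    struct MyCLKernel
    {
        void configure(arm_compute::ICLTensor *src, arm_compute::ICLTensor *dst);
    };

    void build_kernels(arm_compute::ICLTensor *src, arm_compute::ICLTensor *dst)
    {
        // Constructs the kernel and forwards the arguments to its configure():
        auto configured = arm_compute::create_configure_kernel<MyCLKernel>(src, dst);

        // Only constructs; configure() must be called separately:
        auto raw = arm_compute::create_kernel<MyCLKernel>();
        raw->configure(src, dst);
    }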
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
new file mode 100644
index 0000000000..c29610c252
--- /dev/null
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_H__
+#define __ARM_COMPUTE_CLKERNELLIBRARY_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+namespace arm_compute
+{
+/** Program class */
+class Program
+{
+public:
+ /** Default constructor. */
+ Program();
+ /** Construct program from source file.
+ *
+ * @param[in] context CL context used to create the program.
+ * @param[in] name Program name.
+ * @param[in] source Program source.
+ */
+ Program(cl::Context context, std::string name, std::string source);
+ /** Construct program from binary file.
+ *
+ * @param[in] context CL context used to create the program.
+ * @param[in] device CL device for which the programs are created.
+ * @param[in] name Program name.
+ * @param[in] binary Program binary.
+ */
+ Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary);
+ /** Default Copy Constructor. */
+ Program(const Program &) = default;
+ /** Default Move Constructor. */
+ Program(Program &&) = default;
+ /** Default copy assignment operator. */
+ Program &operator=(const Program &) = default;
+ /** Default move assignment operator. */
+ Program &operator=(Program &&) = default;
+    /** Returns program name.
+ *
+ * @return Program's name.
+ */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** User-defined conversion to the underlying CL program.
+ *
+ * @return The CL program object.
+ */
+ explicit operator cl::Program() const;
+
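+    /** Build the given CL program.
+     *
+     * @param[in] program       The CL program to build.
+     * @param[in] build_options (Optional) Build options used to build the program.
+     *
+     * @return True if the program was built successfully.
+     */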
+ static bool build(const cl::Program &program, const std::string &build_options = "");
+ /** Build the underlying CL program.
+ *
+ * @param[in] build_options Options used to build the CL program.
+ *
+ * @return A reference to itself.
+ */
+ cl::Program build(const std::string &build_options = "") const;
+
+private:
+ cl::Context _context; /**< Underlying CL context. */
+ cl::Device _device; /**< CL device for which the programs are created. */
+ bool _is_binary; /**< Create program from binary? */
+ std::string _name; /**< Program name. */
+ std::string _source; /**< Source code for the program. */
+ std::vector<unsigned char> _binary; /**< Binary from which to create the program. */
+};
+
+/** Kernel class */
+class Kernel
+{
+public:
+ /** Default Constructor. */
+ Kernel();
+ /** Default Copy Constructor. */
+ Kernel(const Kernel &) = default;
+ /** Default Move Constructor. */
+ Kernel(Kernel &&) = default;
+ /** Default copy assignment operator. */
+ Kernel &operator=(const Kernel &) = default;
+ /** Default move assignment operator. */
+ Kernel &operator=(Kernel &&) = default;
+ /** Constructor.
+ *
+ * @param[in] name Kernel name.
+ * @param[in] program Built program.
+ */
+ Kernel(std::string name, const cl::Program &program);
+ /** Returns kernel name.
+ *
+ * @return Kernel's name.
+ */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Returns OpenCL kernel.
+ *
+ * @return OpenCL Kernel.
+ */
+ explicit operator cl::Kernel() const
+ {
+ return _kernel;
+ }
+
+private:
+ std::string _name; /**< Kernel name */
+ cl::Kernel _kernel; /**< OpenCL Kernel */
+};
+
+/** CLKernelLibrary class */
+class CLKernelLibrary
+{
+ using StringSet = std::set<std::string>;
+
+private:
+ /** Default Constructor. */
+ CLKernelLibrary();
+
+public:
+ /** Prevent instances of this class from being copied. */
+ CLKernelLibrary(const CLKernelLibrary &) = delete;
+ /** Prevent instances of this class from being copied. */
+ const CLKernelLibrary &operator=(const CLKernelLibrary &) = delete;
+ /** Access the KernelLibrary singleton.
+ * @return The KernelLibrary instance.
+ */
+ static CLKernelLibrary &get();
+ /** Initialises the kernel library.
+ *
+ * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded.
+ * @param[in] context (Optional) CL context used to create programs.
+ * @param[in] device (Optional) CL device for which the programs are created.
+ */
+ void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), cl::Device device = cl::Device::getDefault())
+ {
+ _kernel_path = std::move(kernel_path);
+ _context = std::move(context);
+ _device = std::move(device);
+ }
+ /** Sets the path that the kernels reside in.
+ *
+ * @param[in] kernel_path Path of the kernel.
+ */
+ void set_kernel_path(const std::string &kernel_path)
+ {
+ _kernel_path = kernel_path;
+    }
+ /** Sets the CL context used to create programs.
+ *
+ * @note Setting the context also resets the device to the
+ * first one available in the new context.
+ *
+ * @param[in] context A CL context.
+ */
+ void set_context(cl::Context context)
+ {
+ _context = std::move(context);
+
+ const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
+
+ if(cl_devices.empty())
+ {
+ _device = cl::Device();
+ }
+ else
+ {
+ _device = cl_devices[0];
+ }
+    }
+ /** Sets the CL device for which the programs are created.
+ *
+ * @param[in] device A CL device.
+ */
+ void set_device(cl::Device device)
+ {
+ _device = std::move(device);
+    }
+ /** Creates a kernel from the kernel library.
+ *
+ * @param[in] kernel_name Kernel name.
+ * @param[in] build_options_set Kernel build options as a set.
+ *
+ * @return The created kernel.
+ */
+ Kernel create_kernel(const std::string &kernel_name, const StringSet &build_options_set = {}) const;
+ /** Serializes and saves programs to a binary.
+ *
+ */
+ void save_binary();
+ /** Load serialized binary with all the programs.
+ *
+ */
+ void load_binary();
+
+private:
+ /** Load program and its dependencies.
+ *
+ * @param[in] program_name Name of the program to load.
+ */
+ const Program &load_program(const std::string &program_name) const;
+ /** Concatenates contents of a set into a single string.
+ *
+ * @param[in] s Input set to concatenate.
+ *
+ * @return Concatenated string.
+ */
+ std::string stringify_set(const StringSet &s) const;
+
+ cl::Context _context; /**< Underlying CL context. */
+ cl::Device _device; /**< Underlying CL device. */
+ std::string _kernel_path; /**< Path to the kernels folder. */
+ mutable std::map<std::string, const Program> _programs_map; /**< Map with all already loaded program data. */
+ mutable std::map<std::string, cl::Program> _built_programs_map; /**< Map with all already built program data. */
+ static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */
+ static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
+                                                                                 Used for compile-time kernel inclusion. */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_H__ */
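A hedged sketch of the intended call sequence for the singleton above; the kernel name and the build option are illustrative placeholders (the real names are the keys of _kernel_program_map), and init() is left with its default CL context and device:

    #include "arm_compute/core/CL/CLKernelLibrary.h"

    void load_kernels()
    {
        using arm_compute::CLKernelLibrary;

        // Point the singleton at the directory holding the .cl kernel sources:
        CLKernelLibrary::get().init("./src/core/CL/cl_kernels/");

        // "example_kernel" and the -D define below are hypothetical placeholders:
        arm_compute::Kernel k = CLKernelLibrary::get().create_kernel("example_kernel", { "-DDATA_TYPE=float" });

        // Explicit conversion to the underlying OpenCL kernel:
        cl::Kernel cl_k = static_cast<cl::Kernel>(k);
    }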
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
new file mode 100644
index 0000000000..0e9f356e52
--- /dev/null
+++ b/arm_compute/core/CL/CLKernels.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLKERNELS_H__
+#define __ARM_COMPUTE_CLKERNELS_H__
+
+/* Header grouping together all the CL kernels */
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#endif /* __ARM_COMPUTE_CLKERNELS_H__ */
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
new file mode 100644
index 0000000000..c5643d8939
--- /dev/null
+++ b/arm_compute/core/CL/CLTypes.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CL_TYPES_H__
+#define __ARM_COMPUTE_CL_TYPES_H__
+
+namespace arm_compute
+{
+/** Available GPU Targets */
+enum class GPUTarget
+{
+ GPU_ARCH_MASK = 0xF00,
+ MIDGARD = 0x100,
+ BIFROST = 0x200,
+ T600 = 0x110,
+ T700 = 0x120,
+ T800 = 0x130,
+ G70 = 0x210
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CL_TYPES_H__ */
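The values above encode the architecture in the high byte, which is what GPU_ARCH_MASK extracts; a sketch of the masking, written with explicit casts so it does not depend on the bitwise operators the library enables for GPUTarget elsewhere:

    #include "arm_compute/core/CL/CLTypes.h"

    bool is_midgard(arm_compute::GPUTarget target)
    {
        using arm_compute::GPUTarget;
        // T600 (0x110), T700 (0x120) and T800 (0x130) all share the MIDGARD high byte (0x100):
        const int arch = static_cast<int>(target) & static_cast<int>(GPUTarget::GPU_ARCH_MASK);
        return arch == static_cast<int>(GPUTarget::MIDGARD);
    }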
diff --git a/arm_compute/core/CL/ICLArray.h b/arm_compute/core/CL/ICLArray.h
new file mode 100644
index 0000000000..1b676ed5a3
--- /dev/null
+++ b/arm_compute/core/CL/ICLArray.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLARRAY_H__
+#define __ARM_COMPUTE_ICLARRAY_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/ITensor.h"
+
+namespace arm_compute
+{
+/** Interface for OpenCL Array */
+template <class T>
+class ICLArray : public IArray<T>
+{
+public:
+ /** Constructor */
+ explicit ICLArray(size_t max_num_values)
+ : IArray<T>(max_num_values), _mapping(nullptr)
+ {
+ }
+
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLArray(const ICLArray &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLArray &operator=(const ICLArray &) = delete;
+ /** Default destructor */
+ virtual ~ICLArray() = default;
+ /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the array's data.
+ *
+ * @return A reference to an OpenCL buffer containing the array's data.
+ */
+ virtual const cl::Buffer &cl_buffer() const = 0;
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ *
+ * @note The mapped pointer is accessed through buffer().
+ */
+ void map(cl::CommandQueue &q, bool blocking = true)
+ {
+ _mapping = do_map(q, blocking);
+ }
+ /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ void unmap(cl::CommandQueue &q)
+ {
+ do_unmap(q, _mapping);
+ _mapping = nullptr;
+ }
+
+ // Inherited methods overridden:
+ T *buffer() const override
+ {
+ return reinterpret_cast<T *>(_mapping);
+ }
+
+protected:
+ /** Method to be implemented by the child class to map the OpenCL buffer
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+ /** Method to be implemented by the child class to unmap the OpenCL buffer
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] mapping Pointer to the buffer to be unmapped.
+ */
+ virtual void do_unmap(cl::CommandQueue &q, uint8_t *mapping) = 0;
+
+private:
+ uint8_t *_mapping;
+};
+
+using ICLKeyPointArray = ICLArray<KeyPoint>;
+using ICLCoordinates2DArray = ICLArray<Coordinates2D>;
+using ICLDetectionWindowArray = ICLArray<DetectionWindow>;
+using ICLSize2DArray = ICLArray<Size2D>;
+using ICLUInt8Array = ICLArray<cl_uchar>;
+using ICLUInt16Array = ICLArray<cl_ushort>;
+using ICLUInt32Array = ICLArray<cl_uint>;
+using ICLInt16Array = ICLArray<cl_short>;
+using ICLInt32Array = ICLArray<cl_int>;
+using ICLFloatArray = ICLArray<cl_float>;
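+
+/* Map/unmap usage sketch (illustrative; CLInt32Array is assumed to be the
+ * runtime-side concrete type, and queue an existing cl::CommandQueue):
+ *
+ *   CLInt32Array array(16);
+ *   array.map(queue);       // blocking map: the mapping is valid on return
+ *   array.buffer()[0] = 42; // host-side access through the mapping
+ *   array.unmap(queue);     // enqueued only: flush before device access
+ */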
+}
+#endif /*__ARM_COMPUTE_ICLARRAY_H__*/
diff --git a/arm_compute/core/CL/ICLDistribution1D.h b/arm_compute/core/CL/ICLDistribution1D.h
new file mode 100644
index 0000000000..8fbbbbf548
--- /dev/null
+++ b/arm_compute/core/CL/ICLDistribution1D.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLDISTRIBUTION1D_H__
+#define __ARM_COMPUTE_ICLDISTRIBUTION1D_H__
+
+#include "arm_compute/core/IDistribution1D.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** ICLDistribution1D interface class */
+class ICLDistribution1D : public IDistribution1D
+{
+public:
+ /** Constructor: Creates a 1D CLDistribution of a consecutive interval [offset, offset + range - 1]
+ * defined by a start offset and valid range, divided equally into num_bins parts.
+ *
+ * @param[in] num_bins The number of bins the distribution is divided into.
+ * @param[in] offset The start of the range of values to use.
+ * @param[in] range The total number of consecutive values in the distribution interval.
+ */
+ ICLDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
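+ /* Example (illustrative): num_bins = 256, offset = 0 and range = 256 describe
+ * a histogram over the full 8-bit interval [0, 255], one value per bin. */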
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLDistribution1D(const ICLDistribution1D &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ const ICLDistribution1D &operator=(const ICLDistribution1D &) = delete;
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ void map(cl::CommandQueue &q, bool blocking = true);
+ /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ void unmap(cl::CommandQueue &q);
+ /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the distribution's data.
+ *
+ * @return A reference to an OpenCL buffer containing the distribution's data.
+ */
+ virtual cl::Buffer &cl_buffer() = 0;
+ // Inherited methods overridden:
+ uint32_t *buffer() const override;
+
+protected:
+ /** Method to be implemented by the child class to map the OpenCL buffer
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ virtual uint32_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+ /** Method to be implemented by the child class to unmap the OpenCL buffer
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ virtual void do_unmap(cl::CommandQueue &q) = 0;
+
+protected:
+ uint32_t *_mapping; /**< The distribution data. */
+};
+}
+#endif /* __ARM_COMPUTE_ICLDISTRIBUTION1D_H__ */
diff --git a/arm_compute/core/CL/ICLHOG.h b/arm_compute/core/CL/ICLHOG.h
new file mode 100644
index 0000000000..a3d2fb4a57
--- /dev/null
+++ b/arm_compute/core/CL/ICLHOG.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLHOG_H__
+#define __ARM_COMPUTE_ICLHOG_H__
+
+#include "arm_compute/core/IHOG.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** Interface for OpenCL HOG data-object */
+class ICLHOG : public IHOG
+{
+public:
+ /** Default constructor */
+ ICLHOG();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLHOG(const ICLHOG &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLHOG &operator=(const ICLHOG &) = delete;
+ /** Allow instances of this class to be moved */
+ ICLHOG(ICLHOG &&) = default;
+ /** Allow instances of this class to be moved */
+ ICLHOG &operator=(ICLHOG &&) = default;
+ /** Default destructor */
+ virtual ~ICLHOG() = default;
+
+ /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the hog's descriptor
+ *
+ * @return A reference to an OpenCL buffer containing the hog's descriptor
+ */
+ virtual const cl::Buffer &cl_buffer() const = 0;
+
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ *
+ * @note The mapped pointer is accessed through descriptor().
+ */
+ void map(cl::CommandQueue &q, bool blocking = true);
+
+ /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ void unmap(cl::CommandQueue &q);
+
+ /** Interface to be implemented by the child class to free the allocated cl buffer.
+ *
+ * @warning The buffer must have been allocated previously. Otherwise calling the function will fail.
+ */
+ virtual void free() = 0;
+
+ // Inherited methods overridden:
+ float *descriptor() const override;
+
+protected:
+ /** Method to be implemented by the child class to map the OpenCL buffer
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+ /** Method to be implemented by the child class to unmap the OpenCL buffer
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ virtual void do_unmap(cl::CommandQueue &q) = 0;
+
+private:
+ uint8_t *_mapping;
+};
+}
+#endif /*__ARM_COMPUTE_ICLHOG_H__ */
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
new file mode 100644
index 0000000000..72c963d11b
--- /dev/null
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLKERNEL_H__
+#define __ARM_COMPUTE_ICLKERNEL_H__
+
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/IKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+class Window;
+
+/** Common interface for all the OpenCL kernels */
+class ICLKernel : public IKernel
+{
+public:
+ /** Constructor */
+ ICLKernel();
+ /** Returns a reference to the OpenCL kernel of this object.
+ *
+ * @return A reference to the OpenCL kernel of this object.
+ */
+ cl::Kernel &kernel();
+ /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+ /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+ /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+ /** Returns the number of arguments enqueued per 1D tensor object.
+ *
+ * @return The number of arguments enqueued per 1D tensor object.
+ */
+ unsigned int num_arguments_per_1D_tensor() const;
+ /** Returns the number of arguments enqueued per 2D tensor object.
+ *
+ * @return The number of arguments enqueued per 2D tensor object.
+ */
+ unsigned int num_arguments_per_2D_tensor() const;
+ /** Returns the number of arguments enqueued per 3D tensor object.
+ *
+ * @return The number of arguments enqueued per 3D tensor object.
+ */
+ unsigned int num_arguments_per_3D_tensor() const;
+ /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
+ *
+ * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ */
+ virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+ /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in,out] idx Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] value Value to set as an argument of the object's kernel.
+ */
+ template <typename T>
+ void add_argument(unsigned int &idx, T value)
+ {
+ _kernel.setArg(idx++, value);
+ }
+
+ /** Set the targeted GPU architecture
+ *
+ * @param[in] target The targeted GPU architecture
+ */
+ void set_target(GPUTarget target);
+
+ /** Set the targeted GPU architecture according to the CL device
+ *
+ * @param[in] device A CL device
+ */
+ void set_target(cl::Device &device);
+
+ /** Get the targeted GPU architecture
+ *
+ * @return The targeted GPU architecture.
+ */
+ GPUTarget get_target() const;
+
+private:
+ /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] window Window the kernel will be executed on.
+ */
+ template <unsigned int dimension_size>
+ void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+ /** Returns the number of arguments enqueued per tensor object.
+ *
+ * @return The number of arguments enqueued per tensor object.
+ */
+ template <unsigned int dimension_size>
+ unsigned int num_arguments_per_tensor() const;
+
+protected:
+ cl::Kernel _kernel; /**< OpenCL kernel to run */
+ cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+ GPUTarget _target; /**< The targeted GPU */
+};
+
+/** Add the kernel to the command queue with the given window.
+ *
+ * @note Depending on the size of the window, this might translate into several jobs being enqueued.
+ *
+ * @note If kernel.kernel() is empty then the function will return without adding anything to the queue.
+ *
+ * @param[in,out] queue OpenCL command queue.
+ * @param[in] kernel Kernel to enqueue
+ * @param[in] window Window the kernel has to process.
+ * @param[in] lws_hint (Optional) Local workgroup size requested, (128, 1) by default.
+ *
+ * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
+ */
+void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = cl::Range_128_1);
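+
+/* Run sketch (illustrative; MyKernel, _input and _output are hypothetical):
+ * a derived kernel typically walks the window slice by slice, sets its
+ * tensor arguments and enqueues itself:
+ *
+ *   void MyKernel::run(const Window &window, cl::CommandQueue &queue)
+ *   {
+ *       Window slice = window.first_slice_window_2D();
+ *       do
+ *       {
+ *           unsigned int idx = 0;
+ *           add_2D_tensor_argument(idx, _input, slice);
+ *           add_2D_tensor_argument(idx, _output, slice);
+ *           enqueue(queue, *this, slice);
+ *       }
+ *       while(window.slide_window_slice_2D(slice));
+ *   }
+ */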
+}
+#endif /*__ARM_COMPUTE_ICLKERNEL_H__ */
diff --git a/arm_compute/core/CL/ICLLut.h b/arm_compute/core/CL/ICLLut.h
new file mode 100644
index 0000000000..2016ebb5c3
--- /dev/null
+++ b/arm_compute/core/CL/ICLLut.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLLUT_H__
+#define __ARM_COMPUTE_ICLLUT_H__
+
+#include "arm_compute/core/ILut.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** Interface for OpenCL LUT */
+class ICLLut : public ILut
+{
+public:
+ /** Default constructor */
+ ICLLut();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLLut(const ICLLut &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLLut &operator=(const ICLLut &) = delete;
+
+ /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the lut's data.
+ *
+ * @return A reference to an OpenCL buffer containing the lut's data.
+ */
+ virtual const cl::Buffer &cl_buffer() const = 0;
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ void map(cl::CommandQueue &q, bool blocking = true);
+ /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ void unmap(cl::CommandQueue &q);
+
+ // Inherited methods overridden:
+ uint8_t *buffer() const override;
+
+protected:
+ /** Method to be implemented by the child class to map the OpenCL buffer
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+ /** Method to be implemented by the child class to unmap the OpenCL buffer
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ virtual void do_unmap(cl::CommandQueue &q) = 0;
+
+private:
+ uint8_t *_mapping;
+};
+}
+#endif /*__ARM_COMPUTE_ICLLUT_H__ */
diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h
new file mode 100644
index 0000000000..9f3c775230
--- /dev/null
+++ b/arm_compute/core/CL/ICLMultiHOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLMULTIHOG_H__
+#define __ARM_COMPUTE_ICLMULTIHOG_H__
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/IMultiHOG.h"
+
+namespace arm_compute
+{
+/** Interface for storing multiple HOG data-objects */
+class ICLMultiHOG : public IMultiHOG
+{
+public:
+ /** Return a pointer to the requested OpenCL HOG model
+ *
+ * @param[in] index The index of the requested OpenCL HOG model.
+ *
+ * @return A pointer to the OpenCL HOG model
+ */
+ virtual ICLHOG *cl_model(size_t index) = 0;
+ /** Return a constant pointer to the requested OpenCL HOG model
+ *
+ * @param[in] index The index of the requested OpenCL HOG model.
+ *
+ * @return A constant pointer to the OpenCL HOG model
+ */
+ virtual const ICLHOG *cl_model(size_t index) const = 0;
+
+ // Inherited methods overridden:
+ IHOG *model(size_t index) override;
+ const IHOG *model(size_t index) const override;
+};
+}
+#endif /*__ARM_COMPUTE_ICLMULTIHOG_H__ */
diff --git a/arm_compute/core/CL/ICLMultiImage.h b/arm_compute/core/CL/ICLMultiImage.h
new file mode 100644
index 0000000000..e8705b1824
--- /dev/null
+++ b/arm_compute/core/CL/ICLMultiImage.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLMULTIIMAGE_H__
+#define __ARM_COMPUTE_ICLMULTIIMAGE_H__
+
+#include "arm_compute/core/IMultiImage.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for OpenCL multi-planar images */
+class ICLMultiImage : public IMultiImage
+{
+public:
+ /** Return a pointer to the requested OpenCL plane of the image.
+ *
+ * @param[in] index The index of the requested plane.
+ *
+ * @return A pointer to the OpenCL plane
+ */
+ virtual ICLImage *cl_plane(unsigned int index) = 0;
+ /** Return a constant pointer to the requested OpenCL plane of the image.
+ *
+ * @param[in] index The index of the requested plane.
+ *
+ * @return A constant pointer to the OpenCL plane
+ */
+ virtual const ICLImage *cl_plane(unsigned int index) const = 0;
+
+ // Inherited methods overridden:
+ IImage *plane(unsigned int index) override;
+ const IImage *plane(unsigned int index) const override;
+};
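+
+/* Access sketch (illustrative; multi_image is a hypothetical object that
+ * implements this interface, e.g. an NV12 image where plane 0 is luma):
+ *
+ *   ICLImage *y_plane = multi_image.cl_plane(0);
+ */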
+}
+#endif /*__ARM_COMPUTE_ICLMULTIIMAGE_H__ */
diff --git a/arm_compute/core/CL/ICLSimple2DKernel.h b/arm_compute/core/CL/ICLSimple2DKernel.h
new file mode 100644
index 0000000000..a1366fb211
--- /dev/null
+++ b/arm_compute/core/CL/ICLSimple2DKernel.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLSIMPLE2DKERNEL_H__
+#define __ARM_COMPUTE_ICLSIMPLE2DKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimpleKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */
+class ICLSimple2DKernel : public ICLSimpleKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+}
+#endif /*__ARM_COMPUTE_ICLSIMPLE2DKERNEL_H__ */
diff --git a/arm_compute/core/CL/ICLSimple3DKernel.h b/arm_compute/core/CL/ICLSimple3DKernel.h
new file mode 100644
index 0000000000..5e981027de
--- /dev/null
+++ b/arm_compute/core/CL/ICLSimple3DKernel.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLSIMPLE3DKERNEL_H__
+#define __ARM_COMPUTE_ICLSIMPLE3DKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output.
+ * Both input tensor and output tensor must have at least 3 dimensions.
+ */
+class ICLSimple3DKernel : public ICLSimple2DKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+}
+#endif /*__ARM_COMPUTE_ICLSIMPLE3DKERNEL_H__ */
diff --git a/arm_compute/core/CL/ICLSimpleKernel.h b/arm_compute/core/CL/ICLSimpleKernel.h
new file mode 100644
index 0000000000..e9fdb7fb8b
--- /dev/null
+++ b/arm_compute/core/CL/ICLSimpleKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLSIMPLEKERNEL_H__
+#define __ARM_COMPUTE_ICLSIMPLEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output */
+class ICLSimpleKernel : public ICLKernel
+{
+public:
+ /** Constructor. */
+ ICLSimpleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLSimpleKernel(const ICLSimpleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLSimpleKernel &operator=(const ICLSimpleKernel &) = delete;
+ /** Allow instances of this class to be moved. */
+ ICLSimpleKernel(ICLSimpleKernel &&) = default;
+ /** Allow instances of this class to be moved. */
+ ICLSimpleKernel &operator=(ICLSimpleKernel &&) = default;
+ /** Default destructor */
+ ~ICLSimpleKernel() = default;
+
+ /** Configure the kernel
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
+ * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] border_size (Optional) Size of the border.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+
+protected:
+ const ICLTensor *_input; /**< Source tensor */
+ ICLTensor *_output; /**< Destination tensor */
+};
+}
+
+#endif /*__ARM_COMPUTE_ICLSIMPLEKERNEL_H__ */
diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h
new file mode 100644
index 0000000000..abc0131379
--- /dev/null
+++ b/arm_compute/core/CL/ICLTensor.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLTENSOR_H__
+#define __ARM_COMPUTE_ICLTENSOR_H__
+
+#include "arm_compute/core/ITensor.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+class CommandQueue;
+}
+
+namespace arm_compute
+{
+/** Interface for OpenCL tensor */
+class ICLTensor : public ITensor
+{
+public:
+ /** Default constructor */
+ ICLTensor();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLTensor(const ICLTensor &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ ICLTensor &operator=(const ICLTensor &) = delete;
+ /** Allow instances of this class to be moved */
+ ICLTensor(ICLTensor &&) = default;
+ /** Allow instances of this class to be moved */
+ ICLTensor &operator=(ICLTensor &&) = default;
+ /** Default destructor */
+ virtual ~ICLTensor() = default;
+
+ /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.
+ *
+ * @return A reference to an OpenCL buffer containing the image's data.
+ */
+ virtual const cl::Buffer &cl_buffer() const = 0;
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ *
+ * @note The mapped pointer is accessed through buffer().
+ */
+ void map(cl::CommandQueue &q, bool blocking = true);
+ /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ void unmap(cl::CommandQueue &q);
+ /** Clear the contents of the tensor synchronously.
+ *
+ * @param[in,out] q The CL command queue to use for the clear operation.
+ */
+ void clear(cl::CommandQueue &q);
+
+ // Inherited methods overridden:
+ uint8_t *buffer() const override;
+
+protected:
+ /** Method to be implemented by the child class to map the OpenCL buffer
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ */
+ virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
+ /** Method to be implemented by the child class to unmap the OpenCL buffer
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ */
+ virtual void do_unmap(cl::CommandQueue &q) = 0;
+
+private:
+ uint8_t *_mapping;
+};
+
+using ICLImage = ICLTensor;
+}
+#endif /*__ARM_COMPUTE_ICLTENSOR_H__ */
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
new file mode 100644
index 0000000000..2fae35c974
--- /dev/null
+++ b/arm_compute/core/CL/OpenCL.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OPENCL_H__
+#define __ARM_COMPUTE_OPENCL_H__
+
+/* Configure the Khronos C++ wrapper to use exceptions and to target the OpenCL 1.1 API (version 110), with OpenCL 1.2 default-build behaviour: */
+#define CL_HPP_ENABLE_EXCEPTIONS
+#define CL_HPP_CL_1_2_DEFAULT_BUILD
+#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_HPP_MINIMUM_OPENCL_VERSION 110
+#include <CL/cl2.hpp>
+
+namespace cl
+{
+static const NDRange Range_128_1 = NDRange(128, 1);
+}
+
+namespace arm_compute
+{
+bool opencl_is_available();
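+
+/* Guard sketch (illustrative): probe for a usable OpenCL runtime before
+ * creating any CL objects, and fall back otherwise:
+ *
+ *   if(!arm_compute::opencl_is_available())
+ *   {
+ *       // hypothetical fallback, e.g. use a CPU implementation instead
+ *   }
+ */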
+}
+#endif /* __ARM_COMPUTE_OPENCL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
new file mode 100644
index 0000000000..e8bd6aac7f
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H__
+#define __ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the absolute difference kernel.
+ *
+ * Absolute difference is computed by:
+ * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
+ */
+class CLAbsoluteDifferenceKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLAbsoluteDifferenceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete;
+ /** Allow instances of this class to be moved. */
+ CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default;
+ /** Allow instances of this class to be moved. */
+ CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default;
+ /** Default destructor */
+ ~CLAbsoluteDifferenceKernel() = default;
+
+ /** Set the inputs and output tensors.
+ *
+ * @param[in] input1 Source tensor. Data types supported: U8/S16.
+ * @param[in] input2 Source tensor. Data types supported: U8/S16.
+ * @param[out] output Destination tensor. Data types supported: U8/S16.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1. */
+ const ICLTensor *_input2; /**< Source tensor 2. */
+ ICLTensor *_output; /**< Destination tensor. */
+};
+}
+#endif /* __ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLAccumulateKernel.h b/arm_compute/core/CL/kernels/CLAccumulateKernel.h
new file mode 100644
index 0000000000..5c8ffdb404
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLAccumulateKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_CLACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the accumulate kernel.
+ *
+ * Accumulation is computed by:
+ * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
+ */
+class CLAccumulateKernel : public ICLSimple2DKernel
+{
+public:
+ /** Set the input and accumulation tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] accum Destination tensor. Data types supported: S16.
+ */
+ void configure(const ICLTensor *input, ICLTensor *accum);
+};
+
+/** Interface for the accumulate weighted kernel.
+ *
+ * Weighted accumulation is computed:
+ * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
+ *
+ * Where @f$ 0 \le \alpha \le 1 @f$
+ * Conceptually, the rounding for this is defined as:
+ * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
+*/
+class CLAccumulateWeightedKernel : public ICLSimple2DKernel
+{
+public:
+ /** Set the input and accumulation tensors, and the scale value.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[in] alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
+ * @param[in,out] accum Accumulated tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
+};
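+
+/* Configuration sketch (illustrative; input and accum are assumed to be
+ * allocated U8 CLTensor objects): with alpha = 0.5 the kernel keeps a
+ * running average, accum = 0.5 * accum + 0.5 * input:
+ *
+ *   CLAccumulateWeightedKernel kernel;
+ *   kernel.configure(&input, 0.5f, &accum);
+ */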
+
+/** Interface for the accumulate squared kernel.
+ *
+ * The accumulation of squares is computed:
+ * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
+ *
+ * Where @f$ 0 \le shift \le 15 @f$
+*/
+class CLAccumulateSquaredKernel : public ICLSimple2DKernel
+{
+public:
+ /** Set the input and accumulation tensors and the shift value.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[in] shift Shift value in the range of [0, 15]. Data types supported: U32.
+ * @param[in,out] accum Accumulated tensor. Data types supported: S16.
+ */
+ void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
+};
+}
+#endif /*__ARM_COMPUTE_CLACCUMULATEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
new file mode 100644
index 0000000000..490e70544b
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the activation layer kernel. */
+class CLActivationLayerKernel : public ICLSimple3DKernel
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32, U16, S16.
+ * @param[out] output Destination tensor. Data type should match the input data type.
+ * @param[in] act_info Activation layer information.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+};
+}
+#endif /*__ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
new file mode 100644
index 0000000000..7d736cdf44
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICADDITIONKERNEL_H__
+#define __ARM_COMPUTE_CLARITHMETICADDITIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the arithmetic addition kernel
+ *
+ * Arithmetic addition is computed by:
+ * @f[ output(x,y) = input1(x,y) + input2(x,y) @f]
+ */
+class CLArithmeticAdditionKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLArithmeticAdditionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticAdditionKernel(const CLArithmeticAdditionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticAdditionKernel &operator=(const CLArithmeticAdditionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLArithmeticAdditionKernel(CLArithmeticAdditionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLArithmeticAdditionKernel &operator=(CLArithmeticAdditionKernel &&) = default;
+ /** Default destructor */
+ ~CLArithmeticAdditionKernel() = default;
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
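+
+/* Configuration sketch (illustrative; the tensors are assumed to be
+ * allocated CLTensor objects): SATURATE clamps on overflow, WRAP wraps:
+ *
+ *   CLArithmeticAdditionKernel kernel;
+ *   kernel.configure(&input1, &input2, &output, ConvertPolicy::SATURATE);
+ */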
+}
+#endif /* __ARM_COMPUTE_CLARITHMETICADDITIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
new file mode 100644
index 0000000000..afecf6ed7d
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTIONKERNEL_H__
+#define __ARM_COMPUTE_CLARITHMETICSUBTRACTIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the arithmetic subtraction kernel
+ *
+ * Arithmetic subtraction is computed by:
+ * @f[ output(x,y) = input1(x,y) - input2(x,y) @f]
+ */
+class CLArithmeticSubtractionKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLArithmeticSubtractionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtractionKernel(const CLArithmeticSubtractionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtractionKernel &operator=(const CLArithmeticSubtractionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLArithmeticSubtractionKernel(CLArithmeticSubtractionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLArithmeticSubtractionKernel &operator=(CLArithmeticSubtractionKernel &&) = default;
+ /** Default destructor */
+ ~CLArithmeticSubtractionKernel() = default;
+
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000000..088853841b
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the BatchNormalization layer kernel.
+ */
+class CLBatchNormalizationLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLBatchNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchNormalizationLayerKernel(const CLBatchNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLBatchNormalizationLayerKernel &operator=(const CLBatchNormalizationLayerKernel &) = delete;
+ /** Default move constructor. */
+ CLBatchNormalizationLayerKernel(CLBatchNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ CLBatchNormalizationLayerKernel &operator=(CLBatchNormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLBatchNormalizationLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division by zero.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_mean;
+ const ICLTensor *_var;
+ const ICLTensor *_beta;
+ const ICLTensor *_gamma;
+ float _epsilon;
+};
+}
+#endif /*__ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__ */
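
The header never spells out the transform itself; assuming the parameters follow the standard batch-normalization definition (which their names strongly suggest), each feature map c is normalized as

    @f[ output(x,y,c) = \gamma_c \cdot \frac{input(x,y,c) - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}} + \beta_c @f]

where mu_c and sigma_c^2 come from the mean and var tensors, and gamma_c and beta_c from gamma and beta.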
diff --git a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
new file mode 100644
index 0000000000..624c422abc
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISEANDKERNEL_H__
+#define __ARM_COMPUTE_CLBITWISEANDKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise AND operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
+ */
+class CLBitwiseAndKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLBitwiseAndKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBitwiseAndKernel(const CLBitwiseAndKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBitwiseAndKernel &operator=(const CLBitwiseAndKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBitwiseAndKernel(CLBitwiseAndKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBitwiseAndKernel &operator=(CLBitwiseAndKernel &&) = default;
+ /** Set the inputs and output images
+ *
+ * @param[in] input1 Source tensor. Data types supported: U8.
+ * @param[in] input2 Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISEANDKERNEL_H__ */
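
A configure sketch building on the setup shown earlier (shape illustrative, all three tensors U8); CLBitwiseOrKernel and CLBitwiseXorKernel below are configured identically, while CLBitwiseNotKernel takes a single input:

    CLTensor a, b, out;
    a.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::U8));
    b.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::U8));
    out.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::U8));

    CLBitwiseAndKernel bitwise_and;
    bitwise_and.configure(&a, &b, &out);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    CLScheduler::get().enqueue(bitwise_and);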
diff --git a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
new file mode 100644
index 0000000000..c9026022e1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISENOTKERNEL_H__
+#define __ARM_COMPUTE_CLBITWISENOTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise NOT operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = \lnot input(x,y) @f]
+ */
+class CLBitwiseNotKernel : public ICLSimple2DKernel
+{
+public:
+ /** Set the input and output images.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISENOTKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
new file mode 100644
index 0000000000..fe8710fbc1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISEORKERNEL_H__
+#define __ARM_COMPUTE_CLBITWISEORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise OR operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
+ */
+class CLBitwiseOrKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLBitwiseOrKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBitwiseOrKernel(const CLBitwiseOrKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBitwiseOrKernel &operator=(const CLBitwiseOrKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBitwiseOrKernel(CLBitwiseOrKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBitwiseOrKernel &operator=(CLBitwiseOrKernel &&) = default;
+ /** Set the inputs and output images
+ *
+ * @param[in] input1 Source tensor. Data types supported: U8.
+ * @param[in] input2 Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISEORKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
new file mode 100644
index 0000000000..f4e0b4df60
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISEXORKERNEL_H__
+#define __ARM_COMPUTE_CLBITWISEXORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the bitwise XOR operation kernel.
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
+ */
+class CLBitwiseXorKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLBitwiseXorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBitwiseXorKernel(const CLBitwiseXorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLBitwiseXorKernel &operator=(const CLBitwiseXorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLBitwiseXorKernel(CLBitwiseXorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLBitwiseXorKernel &operator=(CLBitwiseXorKernel &&) = default;
+ /** Set the inputs and output images
+ *
+ * @param[in] input1 Source tensor. Data types supported: U8.
+ * @param[in] input2 Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1; /**< Source tensor 1 */
+ const ICLTensor *_input2; /**< Source tensor 2 */
+ ICLTensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISEXORKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
new file mode 100644
index 0000000000..0960f7487a
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBOX3X3KERNEL_H__
+#define __ARM_COMPUTE_CLBOX3X3KERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the box 3x3 filter kernel.
+ *
+ */
+class CLBox3x3Kernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLBOX3X3KERNEL_H__ */
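
A sketch of the border flag's effect, assuming the usual convention in this library that the caller fills the input border (e.g. with a fill-border kernel) when the border mode is not undefined:

    CLBox3x3Kernel box;
    // border_undefined = true shrinks the valid region of the output by
    // border_size() instead of requiring the input border to be filled first.
    box.configure(&src /* U8 */, &dst /* U8 */, /* border_undefined = */ true);
    CLScheduler::get().enqueue(box);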
diff --git a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
new file mode 100644
index 0000000000..5ca3e03412
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCANNYEDGEKERNEL_H__
+#define __ARM_COMPUTE_CLCANNYEDGEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform Gradient computation.
+ */
+class CLGradientKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLGradientKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLGradientKernel(const CLGradientKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLGradientKernel &operator=(const CLGradientKernel &) = delete;
+ /** Initialise the kernel's sources, destinations and border mode.
+ *
+ * @note gx, gy and magnitude must all have the same element size (either 16 or 32 bit).
+ *
+ * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32.
+ * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx.
+ * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
+ * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8.
+ * @param[in] norm_type Normalization type. If 1, L1-Norm, otherwise L2-Norm.
+ */
+ void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_gx; /**< Source tensor - Gx component */
+ const ICLTensor *_gy; /**< Source tensor - Gy component */
+ ICLTensor *_magnitude; /**< Destination tensor - Magnitude */
+ ICLTensor *_phase; /**< Destination tensor - Quantized phase */
+};
+
+/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge.
+ *
+ * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
+ * to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed.
+ *
+ * @note Hysteresis is computed in @ref CLEdgeTraceKernel
+ */
+class CLEdgeNonMaxSuppressionKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLEdgeNonMaxSuppressionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete;
+ /** Initialise the kernel's sources, destination and border mode.
+ *
+ * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32.
+ * @param[in] phase Source tensor - Quantized phase. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U16/U32.
+ * @param[in] lower_thr Lower threshold.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */
+ const ICLTensor *_phase; /**< Source tensor - Quantized phase. */
+ ICLTensor *_output; /**< Destination tensor. */
+};
+
+/** OpenCL kernel to perform Edge tracing.
+ */
+class CLEdgeTraceKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLEdgeTraceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete;
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] upper_thr Upper threshold used for the hysteresis
+ * @param[in] lower_thr Lower threshold used for the hysteresis
+ * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32.
+ * Expected to be initialized to 0 before each run.
+ * @param[in,out] recorded Tensor for keeping the recorded pixels. Data types supported: U32
+ * Expected to be initialized to 0 before each run.
+ * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32.
+ * Expected to be initialized to 0 before each run.
+ * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
+ * Expected to be initialized to 0 before each run.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
+ ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Source tensor. */
+ ICLTensor *_output; /**< Destination tensor. */
+ int32_t _lower_thr; /**< Lower threshold used for the hysteresis. */
+ int32_t _upper_thr; /**< Upper threshold used for the hysteresis. */
+ ICLTensor *_visited; /**< Marks visited elements */
+ ICLTensor *_recorded; /**< Marks recorded elements */
+ ICLTensor *_l1_stack; /**< L1 hysteresis stack */
+ ICLTensor *_l1_stack_counter; /**< L1 hysteresis stack counter */
+};
+}
+#endif /* __ARM_COMPUTE_CLCANNYEDGEKERNEL_H__ */
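
How the three stages are meant to chain, as a hedged sketch: the tensor names and thresholds are illustrative, formats follow the doxygen above, and gx/gy would typically come from a Sobel kernel:

    CLGradientKernel gradient;
    gradient.configure(&gx, &gy, &magnitude, &phase, /* norm_type: 1 = L1 */ 1);

    CLEdgeNonMaxSuppressionKernel non_max;
    // 'suppressed' must be cleared before execution (see the note above)
    non_max.configure(&magnitude, &phase, &suppressed, /* lower_thr = */ 20, /* border_undefined = */ false);

    CLEdgeTraceKernel trace;
    // visited, recorded, l1_stack and l1_stack_counter must be zeroed before each run
    trace.configure(&suppressed, &edges, /* upper_thr = */ 60, /* lower_thr = */ 20,
                    &visited, &recorded, &l1_stack, &l1_stack_counter);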
diff --git a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
new file mode 100644
index 0000000000..3e718a2f1a
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H__
+#define __ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the channel combine kernel */
+class CLChannelCombineKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLChannelCombineKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLChannelCombineKernel(const CLChannelCombineKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLChannelCombineKernel(CLChannelCombineKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default;
+ /** Default destructor */
+ ~CLChannelCombineKernel() = default;
+ /** Configure function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
+ * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
+ * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
+ * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format.
+ * @param[out] output The single planar output tensor.
+ */
+ void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
+ /** Configure function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
+ * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
+ * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
+ * @param[out] output The multi planar output tensor.
+ */
+ void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ std::array<const ICLTensor *, 4> _planes;
+ ICLTensor *_output;
+ ICLMultiImage *_output_multi;
+ std::array<uint32_t, 3> _x_subsampling;
+ std::array<uint32_t, 3> _y_subsampling;
+};
+}
+#endif /* __ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H__ */
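
An illustrative single-planar combine; passing nullptr for the unused fourth plane is assumed valid for three-channel formats such as RGB888 (the second overload above produces multi-planar output instead):

    CLChannelCombineKernel combine;
    combine.configure(&r_plane, &g_plane, &b_plane, nullptr, &rgb888);
    CLScheduler::get().enqueue(combine);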
diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
new file mode 100644
index 0000000000..3e9e699a50
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H__
+#define __ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the channel extract kernel */
+class CLChannelExtractKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLChannelExtractKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLChannelExtractKernel(const CLChannelExtractKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLChannelExtractKernel(CLChannelExtractKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default;
+ /** Default destructor */
+ ~CLChannelExtractKernel() = default;
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Source tensor.
+ * @param[in] channel Channel to extract.
+ * @param[out] output Destination tensor. Must be of U8 format.
+ */
+ void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Multi-planar source image.
+ * @param[in] channel Channel to extract.
+ * @param[out] output Single-planar 2D destination image. Must be of U8 format.
+ */
+ void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ uint32_t _num_elems_processed_per_iteration;
+ uint32_t _subsampling;
+};
+}
+#endif /* __ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
new file mode 100644
index 0000000000..9d445e3004
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCOL2IMKERNEL_H__
+#define __ARM_COMPUTE_CLCOL2IMKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the col2im reshaping kernel.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CLCol2ImKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLCol2ImKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCol2ImKernel(const CLCol2ImKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLCol2ImKernel(CLCol2ImKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
+ /** Default destructor */
+ ~CLCol2ImKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. Data types supported: F16, F32
+ * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+ * while the rest represent batch of outputs. Data types supported: Same as @p input
+ * @param[in] convolved_dims Output convolved dimensions.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLCOL2IMKERNEL_H__ */
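
A configure sketch matching the 9-element example in the doxygen above (tensor setup omitted):

    CLCol2ImKernel col2im;
    // Each 9-element column of 'columns' becomes one 3x3 block of 'blocks'
    col2im.configure(&columns, &blocks, std::make_pair(3U, 3U));
    CLScheduler::get().enqueue(col2im);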
diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
new file mode 100644
index 0000000000..a88e2dcdf3
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCOLORCONVERTKERNEL_H__
+#define __ARM_COMPUTE_CLCOLORCONVERTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the color convert kernel.
+ *
+ */
+class CLColorConvertKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLColorConvertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLColorConvertKernel(const CLColorConvertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLColorConvertKernel(CLColorConvertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default;
+ /** Default destructor. */
+ ~CLColorConvertKernel() = default;
+
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Source tensor
+ * @param[out] output Destination tensor
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input multi-planar source image
+ * @param[out] output single-planar destination image
+ */
+ void configure(const ICLMultiImage *input, ICLImage *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input single-planar source image
+ * @param[out] output multi-planar destination image
+ */
+ void configure(const ICLImage *input, ICLMultiImage *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input multi-planar source image
+ * @param[out] output multi-planar destination image
+ */
+ void configure(const ICLMultiImage *input, ICLMultiImage *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Pointer to the single-planar tensor input */
+ ICLTensor *_output; /**< Pointer to the single-planar tensor output */
+ const ICLMultiImage *_multi_input; /**< Pointer to the multi-planar input */
+ ICLMultiImage *_multi_output; /**< Pointer to the multi-planar output */
+};
+}
+
+#endif /* __ARM_COMPUTE_CLCOLORCONVERTKERNEL_H__ */
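
A single-planar sketch; the format pair is illustrative, and the other overloads take ICLMultiImage for the multi-planar cases:

    CLTensor rgb, rgba;
    rgb.allocator()->init(TensorInfo(640U, 480U, Format::RGB888));
    rgba.allocator()->init(TensorInfo(640U, 480U, Format::RGBA8888));

    CLColorConvertKernel convert;
    convert.configure(&rgb, &rgba);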
diff --git a/arm_compute/core/CL/kernels/CLConvolutionKernel.h b/arm_compute/core/CL/kernels/CLConvolutionKernel.h
new file mode 100644
index 0000000000..9c0908405a
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLConvolutionKernel.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCONVOLUTIONKERNEL_H__
+#define __ARM_COMPUTE_CLCONVOLUTIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/****************************************************************************************\
+ * Square Convolution *
+\****************************************************************************************/
+
+/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
+ * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{
+ * k_0 &=& \frac{m}{2} \\
+ * l_0 &=& \frac{n}{2} \\
+ * sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
+ * @f}
+ *
+ * @note The above equation for this function is similar to the default OpenCV Filter2D function,
+ * which actually computes a correlation and not a convolution.
+ * In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
+ */
+template <unsigned int matrix_size>
+class CLConvolutionKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor, Data types supported: U8, S16.
+ * @param[in] conv Convolution matrix to apply to the input tensor.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+
+/** Interface for the kernel which applies a 3x3 convolution to a tensor. */
+using CLConvolution3x3Kernel = CLConvolutionKernel<3>;
+/** Interface for the kernel which applies a 5x5 convolution to a tensor. */
+using CLConvolution5x5Kernel = CLConvolutionKernel<5>;
+/** Interface for the kernel which applies a 7x7 convolution to a tensor. */
+using CLConvolution7x7Kernel = CLConvolutionKernel<7>;
+/** Interface for the kernel which applies a 9x9 convolution to a tensor. */
+using CLConvolution9x9Kernel = CLConvolutionKernel<9>;
+
+/****************************************************************************************\
+ * Separable Square Convolution *
+\****************************************************************************************/
+
+/** Kernel for the Horizontal pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
+template <unsigned int matrix_size>
+class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel
+{
+public:
+ /** Default Constructor */
+ CLSeparableConvolutionHorKernel();
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor, Data types supported: S16.
+ * @param[in] conv Convolution matrix to apply to the input tensor.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+
+private:
+ BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */
+using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>;
+/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */
+using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>;
+/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */
+using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>;
+
+/** Kernel for the Vertical pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
+template <unsigned int matrix_size>
+class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: S16.
+ * @param[out] output Destination tensor, Data types supported: U8, S16.
+ * @param[in] conv Convolution matrix to apply to the input tensor.
+ * @param[in] scale Scale of the convolution matrix.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] data_type Data type to use for intermediate result. @sa data_type_for_convolution
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+
+/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */
+using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */
+using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */
+using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ * Rectangle Convolution *
+\****************************************************************************************/
+
+/** Kernel for running a convolution on a rectangle matrix.
+ *
+ * @note Supports any combination of 3, 5, 7 and 9 for the matrix width and height.
+ */
+class CLConvolutionRectangleKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLConvolutionRectangleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor, Data types supported: U8, S16.
+ * @param[in] conv Convolution matrix to apply to the input tensor.
+ * @param[in] width Width of convolution matrix (Number of columns)
+ * @param[in] height Height of convolution matrix (Number of rows)
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ BorderSize _border_size;
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_CLCONVOLUTIONKERNEL_H__ */
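
A sketch of the scale semantics described above: with scale = 0 the kernel falls back to the sum of the coefficients (16 for this Gaussian-like matrix):

    const int16_t conv3x3[] =
    {
        1, 2, 1,
        2, 4, 2,
        1, 2, 1
    };

    CLConvolution3x3Kernel conv;
    conv.configure(&src /* U8 */, &dst /* U8 or S16 */, conv3x3, /* scale = */ 0, /* border_undefined = */ false);
    CLScheduler::get().enqueue(conv);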
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
new file mode 100644
index 0000000000..eda4c66883
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth concatenate kernel.
+ * The input tensor will be concatenated into the output tensor.
+ */
+class CLDepthConcatenateKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLDepthConcatenateKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthConcatenateKernel(const CLDepthConcatenateKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthConcatenateKernel &operator=(const CLDepthConcatenateKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDepthConcatenateKernel(CLDepthConcatenateKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDepthConcatenateKernel &operator=(CLDepthConcatenateKernel &&) = default;
+ /** Default destructor */
+ ~CLDepthConcatenateKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Input tensor. Data types supported: F32.
+ * @param[in] depth_offset The offset on the Z axis.
+ * @param[in,out] output Output tensor. Data types supported: F32.
+ *
+ * @note The output tensor's two lowest dimensions can't be smaller than the input's.
+ * @note The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+ *
+ */
+ void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ int _top_bottom;
+ int _left_right;
+};
+}
+#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H__ */
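
A hedged sketch of stacking two inputs into one output along Z, where the depth_offset of the second call is the depth of the first input (tensor names illustrative):

    CLDepthConcatenateKernel concat0, concat1;
    concat0.configure(&input_a, /* depth_offset = */ 0, &output); // fills output depths [0, Ca)
    concat1.configure(&input_b, /* depth_offset = */ input_a.info()->dimension(2), &output); // fills [Ca, Ca + Cb)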
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertKernel.h
new file mode 100644
index 0000000000..2c3b1b8b69
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDepthConvertKernel.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__
+#define __ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the depth conversion kernel.
+ *
+ */
+class CLDepthConvertKernel : public ICLSimple2DKernel
+{
+public:
+ /** Set the input and output of the kernel.
+ *
+ * Valid conversions Input -> Output :
+ *
+ * - U8 -> U16, S16, U32, S32
+ * - U16 -> U8, U32, S32
+ * - S16 -> U8, U32, S32
+ * - U32 -> U8, U16, S16
+ * - S32 -> U8, U16, S16
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8, U16, S16, U32 or S32.
+ * @param[out] output The output tensor. Data types supported: U8, U16, S16, U32 or S32.
+ * @param[in] policy Conversion policy
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+};
+}
+
+#endif /*__ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H__ */
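
An illustrative down-conversion taken from the valid list above (S16 -> U8, saturating, no shift):

    CLDepthConvertKernel convert;
    convert.configure(&src_s16, &dst_u8, ConvertPolicy::SATURATE, /* shift = */ 0);
    CLScheduler::get().enqueue(convert);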
diff --git a/arm_compute/core/CL/kernels/CLDerivativeKernel.h b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
new file mode 100644
index 0000000000..17552aefbe
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDERIVATIVEKERNEL_H__
+#define __ARM_COMPUTE_CLDERIVATIVEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the derivative kernel. */
+class CLDerivativeKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLDerivativeKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLDerivativeKernel(const CLDerivativeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLDerivativeKernel(CLDerivativeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default;
+ /** Default destructor */
+ ~CLDerivativeKernel() = default;
+ /** Initialise the kernel's sources, destination and border
+ *
+ * @note At least one of output_x or output_y must be set
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output_x (Optional) Destination tensor for the X gradient. Data types supported: S16.
+ * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
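+ *
+ * A hedged usage sketch (tensor names are assumptions); the unused output may be nullptr:
+ * @code
+ * CLDerivativeKernel k;
+ * k.configure(&src_u8, &grad_x_s16, nullptr, false); // compute the X gradient only
+ * @endcode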
+ */
+ void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input; /**< Input tensor */
+ ICLTensor *_output_x; /**< Output tensor - Derivative along the X direction */
+ ICLTensor *_output_y; /**< Output tensor - Derivative along the Y direction */
+ bool _run_derivative_x; /**< Do we need to run Derivative X? */
+ bool _run_derivative_y; /**< Do we need to run Derivative Y? */
+};
+}
+#endif /*__ARM_COMPUTE_CLDERIVATIVEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLDilateKernel.h b/arm_compute/core/CL/kernels/CLDilateKernel.h
new file mode 100644
index 0000000000..a5d3beb02f
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLDilateKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDILATEKERNEL_H__
+#define __ARM_COMPUTE_CLDILATEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the dilate kernel.
+ *
+ */
+class CLDilateKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8.
+ * @param[out] output The output tensor. Data types supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLDILATEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLErodeKernel.h b/arm_compute/core/CL/kernels/CLErodeKernel.h
new file mode 100644
index 0000000000..a43c925be6
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLErodeKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLERODEKERNEL_H__
+#define __ARM_COMPUTE_CLERODEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the erode kernel.
+ *
+ */
+class CLErodeKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8.
+ * @param[out] output The output tensor. Data types supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLERODEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLFastCornersKernel.h b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
new file mode 100644
index 0000000000..9817b78ae0
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFASTCORNERSKERNEL_H__
+#define __ARM_COMPUTE_CLFASTCORNERSKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** CL kernel to perform FAST corner detection */
+class CLFastCornersKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLFastCornersKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFastCornersKernel(const CLFastCornersKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLFastCornersKernel(CLFastCornersKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default;
+ /** Default destructor */
+ ~CLFastCornersKernel() = default;
+
+ /** Initialise the kernel.
+ *
+ * @param[in] input Source image. Data types supported: U8.
+ * @param[out] output Output image. Data types supported: U8.
+ * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+ * @param[in] non_max_suppression True if non-maxima suppression is applied, false otherwise.
+ * @param[in] border_mode Strategy to use for borders.
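+ *
+ * A hedged usage sketch (image names and the threshold value are illustrative):
+ * @code
+ * CLFastCornersKernel k;
+ * // Corner response with threshold 20, non-maxima suppression enabled
+ * k.configure(&src_u8, &dst_u8, 20.0f, true, BorderMode::UNDEFINED);
+ * @endcode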
+ */
+ void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
+
+ // Inherited methods overridden
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLImage *_input;
+ ICLImage *_output;
+};
+
+/** CL kernel to copy keypoint information to an ICLKeyPointArray and count the number of key points */
+class CLCopyToArrayKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLCopyToArrayKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default;
+ /** Default destructor */
+ ~CLCopyToArrayKernel() = default;
+
+ /** Initialise the kernel.
+ *
+ * @param[in] input Source image. Data types supported: U8.
+ * @param[in] update_number Flag to indicate whether we need to update the number of corners
+ * @param[out] corners Array of keypoints to store the results.
+ * @param[out] num_buffers CL buffer used to record the number of key points stored in the array.
+ */
+ void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLImage *_input; /**< source image */
+ ICLKeyPointArray *_corners; /**< destination array */
+ cl::Buffer *_num_buffer; /**< CL memory to record number of key points in the array */
+};
+}
+#endif /* __ARM_COMPUTE_CLFASTCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
new file mode 100644
index 0000000000..797f86dae8
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFILLBORDERKERNEL_H__
+#define __ARM_COMPUTE_CLFILLBORDERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to fill the border of a tensor */
+class CLFillBorderKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLFillBorderKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFillBorderKernel(const CLFillBorderKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFillBorderKernel &operator=(const CLFillBorderKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLFillBorderKernel(CLFillBorderKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLFillBorderKernel &operator=(CLFillBorderKernel &&) = default;
+ /** Default destructor */
+ ~CLFillBorderKernel() = default;
+
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in,out] tensor Tensor to process. Data types supported: U8, S16, S32, F32.
+ * @param[in] border_size Size of the border to fill in elements.
+ * @param[in] border_mode Border mode to use for filling the border.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
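+ *
+ * A hedged usage sketch (the tensor name is an assumption):
+ * @code
+ * CLFillBorderKernel k;
+ * // Fill a one-element border with the default (zero-initialised) constant value
+ * k.configure(&tensor, BorderSize(1), BorderMode::CONSTANT, PixelValue());
+ * @endcode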
+ */
+ void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+ /** Function to set the constant border value on the fill border kernel, depending on the data type.
+ *
+ * @param[in] idx Index of the kernel argument to set.
+ * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ template <class T>
+ void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ bool is_parallelisable() const override;
+
+private:
+ ICLTensor *_tensor;
+};
+}
+#endif /*__ARM_COMPUTE_CLFILLBORDERKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
new file mode 100644
index 0000000000..3ac7b3c4fa
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__
+#define __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel which interleaves the elements of a matrix A in chunks of 4x4
+ *
+ * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
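+ *
+ * In the 4x4 example above, the element at row r and column c of the input lands at output column 4 * c + r (illustrative pseudocode, not part of the API):
+ * @code
+ * out[0][4 * c + r] = in[r][c]
+ * @endcode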
+ */
+class CLGEMMInterleave4x4Kernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGEMMInterleave4x4Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMInterleave4x4Kernel(const CLGEMMInterleave4x4Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMInterleave4x4Kernel &operator=(const CLGEMMInterleave4x4Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMInterleave4x4Kernel(CLGEMMInterleave4x4Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..f84d0638da
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to compute the low precision matrix product
+ *
+ * This kernel performs the following computation:
+ * -# Convert the values of matrix A from uint8 to int32 and add a_offset to each of them.
+ * -# Convert the values of matrix B from uint8 to int32 and add b_offset to each of them.
+ * -# Compute the int32 matrix product of the resulting A * B.
+ * -# Add output_offset to each entry of the result.
+ * -# Multiply each entry of the result by output_mult_int, shift right by shift bits, and round to the nearest integer.
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
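+ *
+ * Restating the steps above as per-element pseudocode (illustrative only):
+ * @code
+ * acc       = sum_k (a[i][k] + a_offset) * (b[k][j] + b_offset)  // int32 accumulation
+ * out[i][j] = clamp_to_u8(((acc + output_offset) * output_mult_int) >> shift)
+ * @endcode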
+ */
+class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
+{
+public:
+ /** Default Constructor */
+ CLGEMMLowpMatrixMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpMatrixMultiplyKernel(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpMatrixMultiplyKernel &operator=(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpMatrixMultiplyKernel(CLGEMMLowpMatrixMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel.
+ * These two kernels change the layout of the original matrices to be more cache-friendly.
+ *
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data types supported: U8
+ * @param[in] input1 Input tensor containing the transposed Matrix B. Data types supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data types supported: same as @p input0
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] output_offset Offset to be added to each element of the output matrix
+ * @param[in] output_mult_int Multiplier to be applied to each element of the output matrix
+ * @param[in] shift Number of bits to shift right the result.
+ */
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input0;
+ const ICLTensor *_input1;
+ ICLTensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 0000000000..ea1db9f831
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface to add a bias to each row of the input tensor
+ *
+ */
+class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel &operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data types supported: same as @p accum
+ */
+ void configure(ICLTensor *accum, const ICLTensor *biases);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_accum;
+ const ICLTensor *_biases;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h
new file mode 100644
index 0000000000..c808039567
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMMATRIXADDITIONKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMMATRIXADDITIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ * The matrices must have the same dimensions.
+ *
+ * @note This kernel is computed if and only if beta != 0.0.
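+ *
+ * Restated as a per-element formula (computed in place on the output):
+ *
+ * @f[ output(i, j) = output(i, j) + \beta \cdot input(i, j) @f]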
+ */
+class CLGEMMMatrixAdditionKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGEMMMatrixAdditionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAdditionKernel(const CLGEMMMatrixAdditionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAdditionKernel &operator=(const CLGEMMMatrixAdditionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAdditionKernel(CLGEMMMatrixAdditionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAdditionKernel &operator=(CLGEMMMatrixAdditionKernel &&) = default;
+ /** Initialise the kernel's input, output and beta value
+ *
+ * @note The input and output tensors must have the same dimensions
+ *
+ * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32
+ * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref CLGEMMMatrixMultiplyKernel. Data type supported: same as @p input
+ * @param[in] beta Weight of matrix C
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, float beta);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLGEMMMATRIXADDITIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..07ea3c12ac
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices, reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that neither tensor has been reshaped
+ *
+ * @attention The second input tensor must have at least 2 dimensions (matrix)
+ *
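+ * A hedged usage sketch of the matrix case (tensor names are assumptions; see the notes above for when the reshape kernels apply):
+ * @code
+ * CLGEMMInterleave4x4Kernel  interleave_a;
+ * CLGEMMTranspose1xWKernel   transpose_b;
+ * CLGEMMMatrixMultiplyKernel mm;
+ * interleave_a.configure(&a, &a_reshaped);            // reshape Matrix A
+ * transpose_b.configure(&b, &b_reshaped);             // reshape Matrix B
+ * mm.configure(&a_reshaped, &b_reshaped, &out, 1.0f); // alpha = 1
+ * @endcode
+ *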
+ */
+class CLGEMMMatrixMultiplyKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGEMMMatrixMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixMultiplyKernel(const CLGEMMMatrixMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixMultiplyKernel &operator=(const CLGEMMMatrixMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixMultiplyKernel(CLGEMMMatrixMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
+ /** Initialise the kernel's input, output and alpha
+ *
+ * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+ * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+ * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+ * @param[in] alpha Weight of the matrix product
+ */
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input0;
+ const ICLTensor *_input1;
+ ICLTensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
new file mode 100644
index 0000000000..8d44a4c4fa
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel which transposes the elements of a matrix in chunks of 1x4 if the input data type is F32 or in chunks of 1x8 if the input data type is F16.
+ *
+ * The following example shows how the 1xW transposition works when the input data type is F32:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * The following example shows how the 1xW transposition works when the input data type is F16:
+ *
+ * @f[
+ * \left( \begin{array}{cccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
+ * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
+ * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
+ * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
+ * @note If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ * @note If the input data type is U8, the output matrix will have the following shape: [ height * 16, width / 16 ]
+ *
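+ * Equivalently, as an illustrative element mapping (pseudocode), where W is 4 for F32, 8 for F16 and 16 for U8, and (r, c) is the row and column of an input element:
+ * @code
+ * out[c / W][W * r + (c % W)] = in[r][c]
+ * @endcode
+ *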
+ */
+class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/F16/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+}
+#endif /* __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
new file mode 100644
index 0000000000..028a10b421
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H__
+#define __ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Gaussian 3x3 filter kernel.
+ *
+ */
+class CLGaussian3x3Kernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8.
+ * @param[out] output The output tensor. Data types supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
new file mode 100644
index 0000000000..1484c06311
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H__
+#define __ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H__
+
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */
+class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel
+{
+public:
+ /** Initialise the kernel's source, destination and border.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Data types supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
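+ *
+ * A hedged sketch of the full two-pass 5x5 filter (the intermediate S16 tensor is an assumption):
+ * @code
+ * CLGaussian5x5HorKernel  hor;
+ * CLGaussian5x5VertKernel vert;
+ * hor.configure(&src_u8, &tmp_s16, false);  // horizontal pass: U8 -> S16
+ * vert.configure(&tmp_s16, &dst_u8, false); // vertical pass:   S16 -> U8
+ * @endcode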
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+private:
+ //Make the configure method of the parent class private
+ using CLSeparableConvolution5x5HorKernel::configure;
+};
+
+/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */
+class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel
+{
+public:
+ /** Initialise the kernel's source, destination and border.
+ *
+ * @param[in] input Input tensor (output of the horizontal pass). Data types supported: S16.
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+private:
+ //Make the configure method of the parent class private
+ using CLSeparableConvolution5x5VertKernel::configure;
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
new file mode 100644
index 0000000000..6d79d0e718
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__
+#define __ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimpleKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */
+class CLGaussianPyramidHorKernel : public ICLSimpleKernel
+{
+public:
+ /** Default constructor */
+ CLGaussianPyramidHorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default;
+ /** Default destructor */
+ ~CLGaussianPyramidHorKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ BorderSize _border_size;
+ int _l2_load_offset;
+};
+
+/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */
+class CLGaussianPyramidVertKernel : public ICLSimpleKernel
+{
+public:
+ /** Default constructor */
+ CLGaussianPyramidVertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default;
+ /** Default destructor */
+ ~CLGaussianPyramidVertKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U16.
+ * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ int _t2_load_offset;
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
new file mode 100644
index 0000000000..45a5aac1bc
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__
+#define __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** OpenCL kernel to perform HOG Orientation Binning */
+class CLHOGOrientationBinningKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLHOGOrientationBinningKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
+ /** Default destructor */
+ ~CLHOGOrientationBinningKernel() = default;
+
+ /** Initialise the kernel's inputs, output and HOG's metadata
+ *
+ * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+ * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8.
+ * @param[out] output Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] hog_info HOG's metadata
+ */
+ void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input_magnitude;
+ const ICLTensor *_input_phase;
+ ICLTensor *_output;
+ Size2D _cell_size;
+};
+
+/** OpenCL kernel to perform HOG block normalization */
+class CLHOGBlockNormalizationKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLHOGBlockNormalizationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default;
+ /** Default destructor */
+ ~CLHOGBlockNormalizationKernel() = default;
+
+ /** Initialise the kernel's input, output and HOG's metadata
+ *
+ * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] hog_info HOG's metadata
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ Size2D _num_cells_per_block_stride;
+};
+}
+#endif /* __ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
new file mode 100644
index 0000000000..47bd0549ee
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__
+#define __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/OpenCL.h"
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform HOG detection using a linear SVM */
+class CLHOGDetectorKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLHOGDetectorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
+ /** Default destructor */
+ ~CLHOGDetectorKernel() = default;
+
+ /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+ *
+ * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] hog HOG data object used by @ref CLHOGOrientationBinningKernel and @ref CLHOGBlockNormalizationKernel
+ * @param[out] detection_windows Array of @ref DetectionWindow used to store all the detected objects
+ * @param[out] num_detection_windows Buffer which stores the number of detected objects
+ * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+ * It must be a multiple of hog->info()->block_stride()
+ * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
+ * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
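+ *
+ * A minimal usage sketch (hypothetical names and illustrative values; all objects assumed initialised and allocated):
+ * @code
+ * CLHOGDetectorKernel detector;
+ * detector.configure(&block_norm_out, &hog, &windows, &num_windows, Size2D(8, 8));
+ * @endcode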
+ */
+ void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
+ uint16_t idx_class = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLDetectionWindowArray *_detection_windows;
+ cl::Buffer *_num_detection_windows;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDETECTORKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
new file mode 100644
index 0000000000..d8057df8d1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHARRISCORNERSKERNEL_H__
+#define __ARM_COMPUTE_CLHARRISCORNERSKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the harris score kernel.
+ *
+ * @note The implementation supports 3, 5, and 7 for the block_size.
+ */
+class CLHarrisScoreKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLHarrisScoreKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default;
+ /** Default destructor */
+ ~CLHarrisScoreKernel() = default;
+
+ /** Setup the kernel parameters
+ *
+ * @param[in] input1 Source image (gradient X). Data types supported: S16, S32. (Must be the same as input2)
+ * @param[in] input2 Source image (gradient Y). Data types supported: S16, S32. (Must be the same as input1)
+ * @param[out] output Destination image (Harris score). Data types supported: F32
+ * @param[in] block_size The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7
+ * @param[in] norm_factor Normalization factor to use accordingly with the gradient size (Must be different from 0)
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
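+ *
+ * A minimal usage sketch (hypothetical names and illustrative parameter values only; gx/gy assumed to be gradients produced by a Sobel or similar kernel):
+ * @code
+ * CLHarrisScoreKernel harris;
+ * harris.configure(&gx, &gy, &score, 3, 1.f / 255.f, 10000.f, 0.04f, false);
+ * @endcode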
+ */
+ void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+ int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+ bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+protected:
+ const ICLImage *_input1; /**< Source image - Gx component */
+ const ICLImage *_input2; /**< Source image - Gy component */
+ ICLImage *_output; /**< Destination image - Harris score */
+ float _sensitivity; /**< Sensitivity value */
+ float _strength_thresh; /**< Threshold value */
+ float _norm_factor; /**< Normalization factor */
+ BorderSize _border_size; /**< Border size */
+};
+}
+#endif /* __ARM_COMPUTE_CLHARRISCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLHistogramKernel.h b/arm_compute/core/CL/kernels/CLHistogramKernel.h
new file mode 100644
index 0000000000..b65e62d9a2
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLHistogramKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHISTOGRAMKERNEL_H__
+#define __ARM_COMPUTE_CLHISTOGRAMKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLDistribution1D;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface to run the histogram kernel. This kernel processes the part of the image whose width is a multiple of 16.
+ * If the image width is not a multiple of 16, the remaining pixels have to be processed with the @ref CLHistogramBorderKernel
+ */
+class CLHistogramKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLHistogramKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHistogramKernel(const CLHistogramKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHistogramKernel &operator=(const CLHistogramKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLHistogramKernel(CLHistogramKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLHistogramKernel &operator=(CLHistogramKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Source image. Data types supported: U8.
+ * @param[out] output Destination distribution.
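+ *
+ * A minimal usage sketch (hypothetical names): for an image whose width is not a multiple of 16, this kernel is typically paired with @ref CLHistogramBorderKernel for the leftover pixels:
+ * @code
+ * CLHistogramKernel hist;
+ * CLHistogramBorderKernel hist_border;
+ * hist.configure(&image, &distribution);
+ * hist_border.configure(&image, &distribution);
+ * @endcode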
+ */
+ void configure(const ICLImage *input, ICLDistribution1D *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLImage *_input;
+ ICLDistribution1D *_output;
+};
+
+/** Interface to run the histogram kernel which handles the leftover part of the image */
+class CLHistogramBorderKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLHistogramBorderKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Source image. Data types supported: U8.
+ * @param[out] output Destination distribution.
+ */
+ void configure(const ICLImage *input, ICLDistribution1D *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLImage *_input;
+ ICLDistribution1D *_output;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHISTOGRAMKERNEL_H__*/
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
new file mode 100644
index 0000000000..d2224b53e1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLIM2COLKERNEL_H__
+#define __ARM_COMPUTE_CLIM2COLKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns: each convolution block is stripped out into a single column,
+ * transforming the convolution into a plain matrix multiplication.
+ *
+ * For example, taking the 4x4 image below and assuming 3x3 image blocks with a stride of 1, we have:
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * =
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
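+ *
+ * A plain C++ reference sketch of the rearrangement above (hypothetical helper, not part of the
+ * library; assumes a single-channel w x h image, 3x3 blocks, stride 1 and no padding):
+ * @code
+ * void im2col_3x3(const float *src, int w, int h, float *dst)
+ * {
+ *     int out_row = 0;
+ *     for(int y = 0; y <= h - 3; ++y)
+ *     {
+ *         for(int x = 0; x <= w - 3; ++x, ++out_row)
+ *         {
+ *             // Copy one 3x3 block into one 9-element row of the output
+ *             for(int ky = 0; ky < 3; ++ky)
+ *             {
+ *                 for(int kx = 0; kx < 3; ++kx)
+ *                 {
+ *                     dst[out_row * 9 + ky * 3 + kx] = src[(y + ky) * w + (x + kx)];
+ *                 }
+ *             }
+ *         }
+ *     }
+ * }
+ * @endcode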
+ */
+class CLIm2ColKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLIm2ColKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLIm2ColKernel(const CLIm2ColKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLIm2ColKernel(CLIm2ColKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16, F32
+ * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+ * while every dimension above represents a batch. Data types supported: Same as @p input
+ * @param[in] convolved_dims The convolved output dimensions.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias If biases are provided, each column is expanded with an extra element set to 1 (the bias term).
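+ *
+ * A minimal configure sketch (hypothetical names; an 8x8x3 input convolved with a 3x3 kernel at stride 1 and no padding yields 6x6 output positions):
+ * @code
+ * CLIm2ColKernel im2col;
+ * im2col.configure(&input, &output, std::make_pair(6U, 6U), PadStrideInfo(1, 1, 0, 0), false);
+ * @endcode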
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and the kernel's 3 lowest dimensions match the input's)
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ */
+ void run_reduced(const Window &window, cl::CommandQueue &queue);
+ /** Run the generic convolution layer input reshape kernel
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ * @param[in,out] queue Command queue on which to enqueue the kernel.
+ */
+ void run_generic(const Window &window, cl::CommandQueue &queue);
+
+ /** Common signature for the kernel to run */
+ using Im2ColFunction = void (CLIm2ColKernel::*)(const Window &, cl::CommandQueue &);
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+ PadStrideInfo _conv_info;
+ int _kernel_size;
+ unsigned int _num_elems_processed_per_iteration;
+ Im2ColFunction _run_func;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
new file mode 100644
index 0000000000..0f53c2d2a8
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H__
+#define __ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface to run the horizontal pass of the integral image kernel. */
+class CLIntegralImageHorKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8
+ * @param[out] output Destination tensor. Data types supported: U32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+
+/** Interface to run the vertical pass of the integral image kernel. */
+class CLIntegralImageVertKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLIntegralImageVertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in,out] in_out The input/output tensor. Data types supported: U32
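+ *
+ * A minimal sketch of the two-pass composition (hypothetical names; the horizontal pass writes the U32 tensor which this kernel then accumulates in place):
+ * @code
+ * CLIntegralImageHorKernel hor;
+ * CLIntegralImageVertKernel vert;
+ * hor.configure(&src_u8, &integral_u32);
+ * vert.configure(&integral_u32);
+ * @endcode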
+ */
+ void configure(ICLTensor *in_out);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_in_out;
+};
+}
+#endif /*__ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
new file mode 100644
index 0000000000..4d0dbed55d
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLKTRACKERKERNEL_H__
+#define __ARM_COMPUTE_CLLKTRACKERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Internal keypoint structure for Lucas-Kanade Optical Flow */
+struct CLLKInternalKeypoint
+{
+ float x{ 0.f }; /**< x coordinate of the keypoint */
+ float y{ 0.f }; /**< y coordinate of the keypoint */
+ float tracking_status{ 0.f }; /**< the tracking status of the keypoint */
+ float dummy{ 0.f }; /**< Dummy field to make the data structure 128-bit aligned, so that the GPU can use vload4 */
+};
+
+/** Structure for storing Spatial Gradient Matrix and the minimum eigenvalue for each keypoint */
+struct CLCoefficientTable
+{
+ float A11; /**< iA11 * FLT_SCALE */
+ float A12; /**< iA12 * FLT_SCALE */
+ float A22; /**< iA22 * FLT_SCALE */
+ float min_eig; /**< Minimum eigenvalue */
+};
+
+/** Structure for storing ival, ixval and iyval for each point inside the window */
+struct CLOldValue
+{
+ int16_t ival; /**< Value extracted from the old image */
+ int16_t ixval; /**< Value extracted from the Scharr Gx image */
+ int16_t iyval; /**< Value extracted from the Scharr Gy image */
+ int16_t dummy; /**< Dummy field to make the data structure 128-bit aligned, so that the GPU can use vload4 */
+};
+
+using ICLLKInternalKeypointArray = ICLArray<CLLKInternalKeypoint>;
+using ICLCoefficientTableArray = ICLArray<CLCoefficientTable>;
+using ICLOldValArray = ICLArray<CLOldValue>;
+
+/** Interface to run the initialization step of LKTracker */
+class CLLKTrackerInitKernel : public ICLKernel
+{
+public:
+ /** Initialise the kernel input and output
+ *
+ * @param[in] old_points Pointer to the @ref ICLKeyPointArray storing the old key points
+ * @param[in] new_points_estimates Pointer to the @ref ICLKeyPointArray storing the new estimated key points
+ * @param[out] old_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint old points
+ * @param[out] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
+ * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
+ * @param[in] level The pyramid level
+ * @param[in] num_levels The number of pyramid levels
+ * @param[in] pyramid_scale Scale factor used for generating the pyramid
+ */
+ void configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
+ ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+ bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface to run the finalize step of LKTracker, which truncates the coordinates stored in the new_points array */
+class CLLKTrackerFinalizeKernel : public ICLKernel
+{
+public:
+ /** Initialise the kernel input and output
+ *
+ * @param[in] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
+ * @param[out] new_points Pointer to the @ref ICLKeyPointArray storing new key points
+ */
+ void configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface to run the first stage of LKTracker, where A11, A12, A22, min_eig, ival, ixval and iyval are computed */
+class CLLKTrackerStage0Kernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLLKTrackerStage0Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default;
+ /** Initialise the kernel input and output
+ *
+ * @param[in] old_input Pointer to the input old tensor. Data types supported: U8
+ * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data types supported: S16
+ * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. Data types supported: S16
+ * @param[in] old_points_internal Pointer to the array of CLLKInternalKeypoint old points
+ * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
+ * @param[out] coeff_table Pointer to the array holding the Spatial Gradient coefficients
+ * @param[out] old_ival Pointer to the array holding internal values
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] level The pyramid level
+ */
+ void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
+ ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+ ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ size_t window_dimension, size_t level);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_old_input;
+ const ICLTensor *_old_scharr_gx;
+ const ICLTensor *_old_scharr_gy;
+};
+
+/** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */
+class CLLKTrackerStage1Kernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLLKTrackerStage1Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default;
+ /** Initialise the kernel input and output
+ *
+ * @param[in] new_input Pointer to the input new tensor. Data types supported: U8
+ * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
+ * @param[in] coeff_table Pointer to the array holding the Spatial Gradient coefficients
+ * @param[in] old_ival Pointer to the array holding internal values
+ * @param[in] termination The criteria to terminate the search of each keypoint.
+ * @param[in] epsilon The error for terminating the algorithm
+ * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] level The pyramid level
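+ *
+ * A sketch of how the four LK kernels compose on one pyramid level (hypothetical names and illustrative values; enqueueing and the loop over levels are omitted):
+ * @code
+ * init.configure(&old_pts, &estimates, &old_int, &new_int, true, level, num_levels, 0.5f);
+ * stage0.configure(&old_img, &scharr_gx, &scharr_gy, &old_int, &new_int, &coeffs, &ivals, 21, level);
+ * stage1.configure(&new_img, &new_int, &coeffs, &ivals, Termination::TERM_CRITERIA_BOTH, 0.01f, 20, 21, level);
+ * // Once all levels have been processed:
+ * finalize.configure(&new_int, &new_pts);
+ * @endcode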
+ */
+ void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_new_input;
+};
+}
+#endif /*__ARM_COMPUTE_CLLKTRACKERKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..fda0327461
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to multiply each row of the first tensor with the low 2 dimensions of the second tensor.
+ *
+ * @attention The second input tensor must have at least 2 dimensions (matrix)
+ *
+ */
+class CLLocallyConnectedMatrixMultiplyKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLLocallyConnectedMatrixMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLocallyConnectedMatrixMultiplyKernel(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLocallyConnectedMatrixMultiplyKernel &operator=(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLocallyConnectedMatrixMultiplyKernel(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLocallyConnectedMatrixMultiplyKernel &operator=(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input0 First input tensor. Data types supported: F32
+ * @param[in] input1 Second input tensor. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result. Data type supported: same as @p input0
+ */
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input0;
+ const ICLTensor *_input1;
+ ICLTensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
new file mode 100644
index 0000000000..a8e1dcb361
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H__
+#define __ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to compute magnitude and phase.
+ *
+ */
+class CLMagnitudePhaseKernel : public ICLKernel
+{
+public:
+ /** Default constructor. */
+ CLMagnitudePhaseKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default;
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @note At least one of magnitude or phase must be set.
+ *
+ * @param[in] gx The input gradient X tensor. Data types supported: S16.
+ * @param[in] gy The input gradient Y tensor. Data types supported: S16.
+ * @param[out] magnitude (Optional) The output tensor - Magnitude. Data types supported: S16.
+ * @param[out] phase (Optional) The output tensor - Phase. Data types supported: U8.
+ * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
+ * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
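+ *
+ * A minimal sketch (hypothetical names) computing magnitude only, leaving phase unset:
+ * @code
+ * CLMagnitudePhaseKernel mag_phase;
+ * mag_phase.configure(&gx, &gy, &mag, nullptr);
+ * @endcode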
+ */
+ void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+ MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_gx; /**< Input gradient X. */
+ const ICLTensor *_gy; /**< Input gradient Y. */
+ ICLTensor *_magnitude; /**< Output - Magnitude. */
+ ICLTensor *_phase; /**< Output - Phase. */
+ bool _run_mag; /**< Calculate magnitude? */
+ bool _run_phase; /**< Calculate phase? */
+};
+}
+
+#endif /* __ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
new file mode 100644
index 0000000000..9f30f76e1b
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMEANSTDDEVKERNEL_H__
+#define __ARM_COMPUTE_CLMEANSTDDEVKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace cl
+{
+class Buffer;
+}
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class CLMeanStdDevKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMeanStdDevKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default;
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Input image. Data types supported: U8.
+ * @param[out] mean Average pixel value of the input image.
+ * @param[out] global_sum Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
+ * @param[out] stddev (Optional) Output standard deviation of pixel values.
+ * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
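+ *
+ * A minimal usage sketch (hypothetical names; assumes a valid cl::Context named context and an allocated U8 image):
+ * @code
+ * cl::Buffer sum(context, CL_MEM_READ_WRITE, sizeof(cl_ulong));
+ * float mean = 0.f;
+ * CLMeanStdDevKernel kernel;
+ * kernel.configure(&image, &mean, &sum);
+ * @endcode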
+ */
+ void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLImage *_input;
+ float *_mean;
+ float *_stddev;
+ cl::Buffer *_global_sum;
+ cl::Buffer *_global_sum_squared;
+};
+}
+#endif /* __ARM_COMPUTE_CLMEANSTDDEVKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
new file mode 100644
index 0000000000..5af364b6c6
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMEDIAN3X3KERNEL_H__
+#define __ARM_COMPUTE_CLMEDIAN3X3KERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the median 3x3 filter kernel.
+ *
+ */
+class CLMedian3x3Kernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLMEDIAN3X3KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
new file mode 100644
index 0000000000..6a31f3cf18
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H__
+#define __ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Interface for the kernel to perform min max search on an image.
+ */
+class CLMinMaxKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMinMaxKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMinMaxKernel(const CLMinMaxKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMinMaxKernel(CLMinMaxKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input image. Data types supported: U8 or S16.
+ * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32.
+ */
+ void configure(const ICLImage *input, cl::Buffer *min_max);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input; /**< Input image. */
+ cl::Buffer *_min_max; /**< Minimum/maximum value. */
+ std::array<int, 2> _data_type_max_min; /**< Maximum and minimum data type value respectively. */
+};
+
+/** Interface for the kernel to find min max locations of an image.
+ */
+class CLMinMaxLocationKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLMinMaxLocationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default;
+ /** Initialise the kernel's input and outputs.
+ *
+ * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
+ *
+ * @param[in] input Input image. Data types supported: U8 or S16.
+ * @param[in] min_max Buffer of 2 elements which contains the min value at position 0 and the max value at position 1. Data type supported: S32
+ * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
+ * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations.
+ * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations.
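+ *
+ * A minimal sketch of chaining with @ref CLMinMaxKernel (hypothetical names; min_max is the 2-element S32 buffer filled by that kernel):
+ * @code
+ * CLMinMaxLocationKernel loc;
+ * loc.configure(&image, &min_max, &min_max_count, &min_locations, &max_locations);
+ * @endcode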
+ */
+ void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
+ ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLImage *_input; /**< Input image. */
+ cl::Buffer *_min_max_count; /**< Minimum/maximum value occurrences. */
+};
+}
+#endif /*__ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
new file mode 100644
index 0000000000..0c59063bbc
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H__
+#define __ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class CLNonLinearFilterKernel : public ICLSimple2DKernel
+{
+public:
+ /** Default constructor */
+ CLNonLinearFilterKernel();
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data types supported: U8
+ * @param[out] output Destination tensor. Data types supported: U8
+ * @param[in] function Non linear function to perform
+ * @param[in] mask_size Mask size. Supported sizes: 3, 5
+ * @param[in] pattern Mask pattern
+ * @param[in] mask The given mask. Will be used only if pattern is specified to PATTERN_OTHER
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
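+ *
+ * A minimal usage sketch (hypothetical names; a 5x5 median filter with a box pattern, so per the note above the explicit mask is not read and may be left null):
+ * @code
+ * CLNonLinearFilterKernel filter;
+ * filter.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 5, MatrixPattern::BOX, nullptr, false);
+ * @endcode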
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
+ unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+
+private:
+ BorderSize _border_size; /**< Border size */
+};
+}
+#endif /*__ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
new file mode 100644
index 0000000000..1719bbbb47
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H__
+#define __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL
+ *
+ * @note Used by @ref CLFastCorners and @ref CLHarrisCorners
+ */
+class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's sources, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
+ * @param[out] output Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /* __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
new file mode 100644
index 0000000000..ca9034b162
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class CLNormalizationLayerKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerKernel(const CLNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNormalizationLayerKernel &operator=(const CLNormalizationLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16, F32.
+ * @param[in] squared_input Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+ * Data types should match the input type.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types should match the input type.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
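+ *
+ * A minimal configure sketch (hypothetical names; cross-map normalization over 5 neighbouring feature maps, with squared holding the element-wise square of src):
+ * @code
+ * CLNormalizationLayerKernel norm;
+ * norm.configure(&src, &squared, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5));
+ * @endcode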
+ */
+ void configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_squared_input;
+ ICLTensor *_output;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
new file mode 100644
index 0000000000..6fbbe95219
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__
+#define __ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the pixelwise multiplication kernel.
+ *
+ */
+class CLPixelWiseMultiplicationKernel : public ICLKernel
+{
+public:
+ /** Default constructor.*/
+ CLPixelWiseMultiplicationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPixelWiseMultiplicationKernel(const CLPixelWiseMultiplicationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLPixelWiseMultiplicationKernel &operator=(const CLPixelWiseMultiplicationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPixelWiseMultiplicationKernel(CLPixelWiseMultiplicationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8, S16, F16, F32.
+ * @param[in] input2 An input tensor. Data types supported: U8, S16, F16, F32.
+ * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input1;
+ const ICLTensor *_input2;
+ ICLTensor *_output;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H__ */
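Driving a kernel directly from the host follows the same pattern throughout the library; a minimal end-to-end sketch, assuming the CLScheduler and CLTensor runtime wrappers used by the library's examples and an illustrative 64x64 shape (most applications would go through the runtime function CLPixelWiseMultiplication instead):

    #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // Create the CL context, queue and kernel cache

        CLTensor in1, in2, out;
        in1.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        in2.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        out.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

        CLPixelWiseMultiplicationKernel mul;
        // A scale of 1/255 keeps the product of two U8 inputs in range, per the constraint above
        mul.configure(&in1, &in2, &out, 1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill in1 and in2, e.g. via map()/unmap() ...

        CLScheduler::get().enqueue(mul); // Run over the kernel's full execution window
        CLScheduler::get().sync();
        return 0;
    }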
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
new file mode 100644
index 0000000000..546a40b15e
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the pooling layer kernel */
+class CLPoolingLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPoolingLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPoolingLayerKernel(const CLPoolingLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPoolingLayerKernel &operator=(const CLPoolingLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPoolingLayerKernel(CLPoolingLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPoolingLayerKernel &operator=(CLPoolingLayerKernel &&) = default;
+ /** Default destructor */
+ ~CLPoolingLayerKernel() = default;
+
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ PoolingLayerInfo _pool_info;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H__ */
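The pooling geometry is carried entirely by PoolingLayerInfo; a configuration sketch, with setup and scheduling as in the previous example (the runtime CLPoolingLayer additionally fills the input border before running this kernel):

    // 2x2 max pooling with stride 2 and no padding halves each spatial dimension
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    CLPoolingLayerKernel pool;
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));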
diff --git a/arm_compute/core/CL/kernels/CLRemapKernel.h b/arm_compute/core/CL/kernels/CLRemapKernel.h
new file mode 100644
index 0000000000..7cebf2e817
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLRemapKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLREMAPKERNEL_H__
+#define __ARM_COMPUTE_CLREMAPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform a remap on a tensor */
+class CLRemapKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLRemapKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLRemapKernel(const CLRemapKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLRemapKernel &operator=(const CLRemapKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLRemapKernel(CLRemapKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLRemapKernel &operator=(CLRemapKernel &&) = default;
+ /** Initialize the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[in] map_x Map for X coordinates. Data types supported: F32.
+ * @param[in] map_y Map for Y coordinates. Data types supported: F32.
+ * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+ * @param[in] policy The interpolation type.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ const ICLTensor *_map_x;
+ const ICLTensor *_map_y;
+};
+}
+#endif /*__ARM_COMPUTE_CLREMAPKERNEL_H__ */
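A configuration sketch, assuming each F32 map holds one source coordinate per destination pixel (tensor setup as in the earlier sketches):

    CLTensor src, map_x, map_y, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    map_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32)); // Source X per output pixel
    map_y.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32)); // Source Y per output pixel

    CLRemapKernel remap;
    remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, true /* border undefined */);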
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
new file mode 100644
index 0000000000..e74a7cb82a
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCALEKERNEL_H__
+#define __ARM_COMPUTE_CLSCALEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the scale kernel */
+class CLScaleKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's inputs, output and interpolation policy
+ *
+     * @note The scaling factors are inferred from the ratio between the input and output tensor dimensions.
+ *
+ * @param[in] input Source tensor. Data types supported: U8, S16.
+ * @param[out] output Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] policy Interpolation type to use
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLSCALEKERNEL_H__ */
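Since the scaling factors are inferred from the two shapes, a 2x bilinear upscale reduces to the sketch below (setup as above):

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8)); // Output shape sets the ratio

    CLScaleKernel scale;
    scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, true /* border undefined */);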
diff --git a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
new file mode 100644
index 0000000000..fe245cc351
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCHARR3X3KERNEL_H__
+#define __ARM_COMPUTE_CLSCHARR3X3KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
+ *
+ * @f[
+ * \mathbf{G}_x=\begin{vmatrix}
+ * -3 & 0 & +3\\
+ * -10& 0 & +10\\
+ * -3 & 0 & +3
+ * \end{vmatrix}
+ * @f]
+ * @f[
+ * \mathbf{G}_y=\begin{vmatrix}
+ * -3 & -10 & -3\\
+ * 0 & 0 & 0\\
+ * +3 & +10 & +3
+ * \end{vmatrix}
+ * @f]
+ */
+class CLScharr3x3Kernel : public ICLKernel
+{
+public:
+ /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+ CLScharr3x3Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default;
+ /** Initialise the kernel's source, destination and border.
+ *
+ * @note At least one of output_x or output_y must be set.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ bool _run_scharr_x; /**< Do we need to run Scharr X ? */
+ bool _run_scharr_y; /**< Do we need to run Scharr Y ? */
+ const ICLTensor *_input; /**< Input image */
+ ICLTensor *_output_x; /**< Output image for scharr X */
+ ICLTensor *_output_y; /**< Output image for scharr Y */
+};
+}
+#endif /*__ARM_COMPUTE_CLSCHARR3X3KERNEL_H__ */
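Both outputs are optional, so a caller that needs only the horizontal gradient can pass nullptr for the other; a sketch:

    CLTensor src, gx;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    gx.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

    CLScharr3x3Kernel scharr;
    scharr.configure(&src, &gx, nullptr, true); // Y gradient not computed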
diff --git a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
new file mode 100644
index 0000000000..9edeb6ceff
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL3X3KERNEL_H__
+#define __ARM_COMPUTE_CLSOBEL3X3KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */
+class CLSobel3x3Kernel : public ICLKernel
+{
+public:
+ /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+ CLSobel3x3Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default;
+ /** Default destructor */
+ ~CLSobel3x3Kernel() = default;
+ /** Initialise the kernel's source, destination and border.
+ *
+ * @note At least one of output_x or output_y must be set.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input; /**< Input tensor */
+ ICLTensor *_output_x; /**< Output tensor for Sobel X */
+ ICLTensor *_output_y; /**< Output tensor for Sobel Y */
+ bool _run_sobel_x; /**< Do we need to run Sobel X ? */
+ bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL3X3KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
new file mode 100644
index 0000000000..e90f8f587e
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL5X5KERNEL_H__
+#define __ARM_COMPUTE_CLSOBEL5X5KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */
+class CLSobel5x5HorKernel : public ICLKernel
+{
+public:
+ /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+ CLSobel5x5HorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default;
+ /** Default destructor */
+ ~CLSobel5x5HorKernel() = default;
+
+ /** Initialise the kernel's source, destination and border.
+ *
+ * @note At least one of output_x or output_y must be set.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data types supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input; /**< Input tensor */
+ ICLTensor *_output_x; /**< X output of horizontal pass */
+ ICLTensor *_output_y; /**< Y output of horizontal pass */
+ bool _run_sobel_x; /**< Do we need to run Sobel X ? */
+ bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
+ BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */
+class CLSobel5x5VertKernel : public ICLKernel
+{
+public:
+ /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+ CLSobel5x5VertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default;
+ /** Default destructor */
+ ~CLSobel5x5VertKernel() = default;
+
+ /** Initialise the kernel's source, destination and border.
+ *
+     * @note At least one of output_x or output_y must be set, along with the corresponding input.
+ *
+ * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
+ * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
+     * @param[out] output_x          (Optional) Destination tensor for the X gradient. Data types supported: S16.
+     * @param[out] output_y          (Optional) Destination tensor for the Y gradient. Data types supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */
+ const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */
+ ICLTensor *_output_x; /**< X output of sobel */
+ ICLTensor *_output_y; /**< Y output of sobel */
+ bool _run_sobel_x; /**< Do we need to run sobel X? */
+ bool _run_sobel_y; /**< Do we need to run sobel Y? */
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL5X5KERNEL_H__ */
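The 5x5 filter is applied as two separable 1D passes chained through intermediate S16 tensors; a sketch, with the same setup assumptions as the earlier examples:

    CLTensor src, tmp_x, tmp_y, gx, gy;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    tmp_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));
    tmp_y.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));
    gx.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));
    gy.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

    CLSobel5x5HorKernel  hor;
    CLSobel5x5VertKernel vert;
    hor.configure(&src, &tmp_x, &tmp_y, true);      // Horizontal 1D pass
    vert.configure(&tmp_x, &tmp_y, &gx, &gy, true); // Vertical 1D pass

    // After allocating all five tensors:
    CLScheduler::get().enqueue(hor, false); // Queue without flushing
    CLScheduler::get().enqueue(vert);       // Flush; both passes execute in order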
diff --git a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
new file mode 100644
index 0000000000..e5ef8444ee
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL7X7KERNEL_H__
+#define __ARM_COMPUTE_CLSOBEL7X7KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */
+class CLSobel7x7HorKernel : public ICLKernel
+{
+public:
+ /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+ CLSobel7x7HorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default;
+ /** Default destructor */
+ ~CLSobel7x7HorKernel() = default;
+
+ /** Initialise the kernel's source, destination and border.
+ *
+ * @note At least one of output_x or output_y must be set.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data types supported: S32.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data types supported: S32.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input; /**< Input tensor */
+ ICLTensor *_output_x; /**< X output of horizontal pass */
+ ICLTensor *_output_y; /**< Y output of horizontal pass */
+ bool _run_sobel_x; /**< Do we need to run Sobel X ? */
+ bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
+ BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */
+class CLSobel7x7VertKernel : public ICLKernel
+{
+public:
+ /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
+ CLSobel7x7VertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default;
+ /** Default destructor */
+ ~CLSobel7x7VertKernel() = default;
+
+ /** Initialise the kernel's source, destination and border.
+ *
+     * @note At least one of output_x or output_y must be set, along with the corresponding input.
+ *
+ * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
+ * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
+     * @param[out] output_x          (Optional) Destination tensor for the X gradient. Data types supported: S32.
+     * @param[out] output_y          (Optional) Destination tensor for the Y gradient. Data types supported: S32.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+ BorderSize border_size() const override;
+
+private:
+ const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */
+ const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */
+ ICLTensor *_output_x; /**< X output of sobel */
+ ICLTensor *_output_y; /**< Y output of sobel */
+ bool _run_sobel_x; /**< Do we need to run sobel X? */
+ bool _run_sobel_y; /**< Do we need to run sobel Y? */
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL7X7KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
new file mode 100644
index 0000000000..0806974ad6
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for identifying the max value of 1D logits */
+class CLLogits1DMaxKernel : public ICLSimple2DKernel
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[out] output Destination tensor. Matching input type and channel number.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+
+/** Interface for shifting the logits values around the max value and exponentiating the result */
+class CLLogits1DShiftExpSumKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLLogits1DShiftExpSumKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DShiftExpSumKernel(const CLLogits1DShiftExpSumKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DShiftExpSumKernel &operator=(const CLLogits1DShiftExpSumKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLogits1DShiftExpSumKernel(CLLogits1DShiftExpSumKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[in] max Max values tensor. Matching input type and channel number.
+ * @param[out] output Destination tensor. Matching input type and channel number.
+ * @param[out] sum Sum of 1D logits tensor. Matching input type and channel number.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_max;
+ ICLTensor *_output;
+ ICLTensor *_sum;
+};
+
+/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
+class CLLogits1DNormKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLLogits1DNormKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DNormKernel(const CLLogits1DNormKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogits1DNormKernel &operator=(const CLLogits1DNormKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLLogits1DNormKernel(CLLogits1DNormKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Matching input type and channel number.
+ * @param[out] output Destination tensor. Matching input type and channel number.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_sum;
+ ICLTensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H__ */
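These three kernels form the pipeline used by the runtime CLSoftmaxLayer: per-row maximum, shift/exponentiate/accumulate, then normalisation by the sum. A sketch for 4 rows of 128 logits, with intermediate shapes following the parameter docs above:

    CLTensor in, row_max, shifted_exp, row_sum, out;
    in.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
    shifted_exp.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
    row_max.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32)); // One max per row
    row_sum.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32)); // One sum per row

    CLLogits1DMaxKernel         max_kernel;
    CLLogits1DShiftExpSumKernel shift_exp_sum_kernel;
    CLLogits1DNormKernel        norm_kernel;
    max_kernel.configure(&in, &row_max);
    shift_exp_sum_kernel.configure(&in, &row_max, &shifted_exp, &row_sum);
    norm_kernel.configure(&shifted_exp, &row_sum, &out);
    // Allocate the tensors, then enqueue the three kernels in this order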
diff --git a/arm_compute/core/CL/kernels/CLTableLookupKernel.h b/arm_compute/core/CL/kernels/CLTableLookupKernel.h
new file mode 100644
index 0000000000..477f58dc38
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLTableLookupKernel.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_CLTABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+class ICLLut;
+
+/** Interface for the kernel to perform table lookup calculations. */
+class CLTableLookupKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input, lut and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8, S16.
+ * @param[in] lut The input LUT. Data types supported: U8, S16.
+ * @param[out] output The output tensor. Data types supported: U8, S16.
+ */
+ void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLTABLELOOKUPKERNEL_H__ */
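A sketch pairing the kernel with the runtime CLLut container, here 256 U8 entries so that every possible U8 input value has an entry (src and dst prepared as in the earlier sketches):

    // Requires "arm_compute/runtime/CL/CLLut.h"
    CLLut lut(256, DataType::U8);
    // ... fill the LUT entries, e.g. via lut.map()/unmap() ...

    CLTableLookupKernel lookup;
    lookup.configure(&src, &lut, &dst);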
diff --git a/arm_compute/core/CL/kernels/CLThresholdKernel.h b/arm_compute/core/CL/kernels/CLThresholdKernel.h
new file mode 100644
index 0000000000..d7a6ae2cdb
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLThresholdKernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTHRESHOLDKERNEL_H__
+#define __ARM_COMPUTE_CLTHRESHOLDKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the thresholding kernel.
+ *
+ */
+class CLThresholdKernel : public ICLSimple2DKernel
+{
+public:
+    /** Initialise the kernel's input, output and threshold parameters.
+ *
+ * @param[in] input An input tensor. Data types supported: U8
+ * @param[out] output The output tensor. Data types supported: U8.
+ * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+     * @param[in]  false_value Value to set when the condition is not met.
+     * @param[in]  true_value  Value to set when the condition is met.
+ * @param[in] type Thresholding type. Either RANGE or BINARY.
+ * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+ uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+};
+}
+#endif /*__ARM_COMPUTE_CLTHRESHOLDKERNEL_H__ */
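For a BINARY threshold the upper bound is ignored; a sketch mapping every pixel above 127 to 255 and the rest to 0 (src and dst are U8 tensors prepared as in the earlier sketches):

    CLThresholdKernel thresh;
    // dst(x, y) = src(x, y) > 127 ? 255 : 0
    thresh.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY, 0 /* upper: unused for BINARY */);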
diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h
new file mode 100644
index 0000000000..9ad183f8f1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLTransposeKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__
+#define __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel which transposes the elements of a matrix.
+ *
+ * [width, height, batch] -> [height, width, batch]
+ *
+ */
+class CLTransposeKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__ */
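Usage is a single configure call; the destination shape swaps the first two dimensions of the source:

    CLTensor mat, mat_t;
    mat.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    mat_t.allocator()->init(TensorInfo(TensorShape(8U, 32U), 1, DataType::F32)); // Width and height swapped

    CLTransposeKernel transpose;
    transpose.configure(&mat, &mat_t);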
diff --git a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
new file mode 100644
index 0000000000..05d6d0a8f7
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWARPAFFINEKERNEL_H__
+#define __ARM_COMPUTE_CLWARPAFFINEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the warp affine kernel. */
+class CLWarpAffineKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialize the function's source, destination, interpolation policy and border_mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor, Data types supported: U8.
+     * @param[in]  matrix The affine matrix. Must be 2x3 of type float.
+ * @param[in] policy The interpolation type.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_CLWARPAFFINEKERNEL_H__ */
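A translation-only sketch; the coefficient order is assumed to be the OpenVX-style column-major layout (x' = m0*x + m2*y + m4, y' = m1*x + m3*y + m5), so verify the convention against the kernel source before encoding a real transform:

    // Shift every pixel by (+5, +10) under the assumed column-major 2x3 layout
    const float matrix[6] = { 1.f, 0.f, 0.f, 1.f, 5.f, 10.f };

    CLWarpAffineKernel warp;
    warp.configure(&src, &dst, matrix, InterpolationPolicy::NEAREST_NEIGHBOR);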
diff --git a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
new file mode 100644
index 0000000000..5c5013c599
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWARPPERSPECTIVEKERNEL_H__
+#define __ARM_COMPUTE_CLWARPPERSPECTIVEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Interface for the warp perspective kernel. */
+class CLWarpPerspectiveKernel : public ICLSimple2DKernel
+{
+public:
+ /** Initialize the function's source, destination, interpolation policy and border_mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] output Destination tensor, Data types supported: U8.
+ * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
+ * @param[in] policy The interpolation type.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy);
+
+ // Inherited methods overridden:
+ BorderSize border_size() const override;
+};
+}
+
+#endif /*__ARM_COMPUTE_CLWARPPERSPECTIVEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
new file mode 100644
index 0000000000..1dc8a8b80e
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
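+/** Base class for the OpenCL weights reshape kernels. Reshapes the weights into a 2D matrix and optionally appends the biases. */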
+class CLWeightsReshapeKernel : public ICLKernel
+{
+public:
+ /** Constructor.
+ *
+ * @param[in] is_shared Flag to indicate whether the weights are shared or not.
+ */
+ CLWeightsReshapeKernel(bool is_shared = false);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
+ /** Default destructor */
+ ~CLWeightsReshapeKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: F16, F32
+ * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+ * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+ * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
+ */
+ void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output);
+
+ // Inherited methods overridden:
+ virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+
+protected:
+ bool _is_shared;
+ const ICLTensor *_input;
+ const ICLTensor *_biases;
+ ICLTensor *_output;
+};
+
+/** Interface for the weights reshape kernel used by convolution and fully connected layers.
+ *
+ * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
+ * In combination with the @ref CLIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3D weight kernel of 3x3 dimensions and a depth of 2, we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class CLConvolutionLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
+{
+public:
+ /** Default constructor */
+ CLConvolutionLayerWeightsReshapeKernel();
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface for the weights reshape kernel used by locally connected layers. */
+class CLLocallyConnectedLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
+{
+public:
+ /** Default constructor */
+ CLLocallyConnectedLayerWeightsReshapeKernel();
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+};
+}
+#endif /*__ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__ */
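Following the formula above, a 3x3 kernel with a depth (IFM) of 2 and 8 OFMs, plus per-OFM biases, reshapes into a matrix with 3*3*2 + 1 = 19 weight rows; the output shape below mirrors the convolution layer's usage of this kernel and should be treated as illustrative:

    CLTensor weights, biases, reshaped;
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 8U), 1, DataType::F32)); // [kx, ky, IFM, OFM]
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));              // [OFM]
    reshaped.allocator()->init(TensorInfo(TensorShape(8U, 19U), 1, DataType::F32));       // Illustrative 2D shape

    CLConvolutionLayerWeightsReshapeKernel reshape;
    reshape.configure(&weights, &biases, &reshaped);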
diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
new file mode 100644
index 0000000000..1eabfa9437
--- /dev/null
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPKERNELS_H__
+#define __ARM_COMPUTE_CPPKERNELS_H__
+
+/* Header grouping all the CPP kernels */
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+
+#endif /* __ARM_COMPUTE_CPPKERNELS_H__ */
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
new file mode 100644
index 0000000000..99ae68f2e5
--- /dev/null
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICPPKERNEL_H__
+#define __ARM_COMPUTE_ICPPKERNEL_H__
+
+#include "arm_compute/core/IKernel.h"
+
+namespace arm_compute
+{
+class Window;
+
+/** Common interface for all kernels implemented in C++ */
+class ICPPKernel : public IKernel
+{
+public:
+ /** Default destructor */
+ virtual ~ICPPKernel() = default;
+
+ /** Execute the kernel on the passed window
+ *
+ * @warning If is_parallelisable() returns false then the passed window must be equal to window()
+ *
+ * @note The window has to be a region within the window returned by the window() method
+ *
+ * @note The width of the window has to be a multiple of num_elems_processed_per_iteration().
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
+ */
+ virtual void run(const Window &window) = 0;
+};
+}
+#endif /*__ARM_COMPUTE_ICPPKERNEL_H__ */
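A sketch of the canonical subclass pattern: configure() computes and registers the maximum execution window, and run() walks whatever sub-window the scheduler passes in. The helpers calculate_max_window and execute_window_loop from arm_compute/core/Helpers.h are assumed, and ZeroFillKernel itself is hypothetical:

    #include "arm_compute/core/CPP/ICPPKernel.h"
    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Window.h"

    using namespace arm_compute;

    // Hypothetical kernel that zeroes a U8 tensor, one element per iteration
    class ZeroFillKernel : public ICPPKernel
    {
    public:
        void configure(ITensor *tensor)
        {
            _tensor = tensor;
            // Register the maximal window so the scheduler knows the iteration space
            ICPPKernel::configure(calculate_max_window(*tensor->info()));
        }
        void run(const Window &window) override
        {
            Iterator it(_tensor, window);
            execute_window_loop(window, [&](const Coordinates &)
            {
                *it.ptr() = 0;
            },
            it);
        }

    private:
        ITensor *_tensor{ nullptr };
    };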
diff --git a/arm_compute/core/CPP/ICPPSimpleKernel.h b/arm_compute/core/CPP/ICPPSimpleKernel.h
new file mode 100644
index 0000000000..105de397a2
--- /dev/null
+++ b/arm_compute/core/CPP/ICPPSimpleKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICPPSIMPLEKERNEL_H__
+#define __ARM_COMPUTE_ICPPSIMPLEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for simple C++ kernels having 1 tensor input and 1 tensor output */
+class ICPPSimpleKernel : public ICPPKernel
+{
+public:
+ /** Constructor */
+ ICPPSimpleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ICPPSimpleKernel(const ICPPSimpleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ICPPSimpleKernel &operator=(const ICPPSimpleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ ICPPSimpleKernel(ICPPSimpleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ ICPPSimpleKernel &operator=(ICPPSimpleKernel &&) = default;
+ /** Default destructor */
+ ~ICPPSimpleKernel() = default;
+
+protected:
+ /** Configure the kernel
+ *
+ * @param[in] input Source tensor.
+ * @param[out] output Destination tensor.
+ * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
+ * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
+ * @param[in] border_size (Optional) Size of the border.
+ */
+ void configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+
+protected:
+ const ITensor *_input;
+ ITensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_ICPPSIMPLEKERNEL_H__ */
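A derived kernel would typically forward to the protected configure() above, letting the base class store the tensors and set up the execution window. A sketch with an invented kernel name (16 elements per iteration is an arbitrary choice):

#include "arm_compute/core/CPP/ICPPSimpleKernel.h"

namespace arm_compute
{
// Hypothetical element-wise kernel with one input and one output tensor
class CPPIdentityKernel : public ICPPSimpleKernel
{
public:
    void configure(const ITensor *input, ITensor *output)
    {
        // No border is read, so border_undefined and border_size keep their defaults
        ICPPSimpleKernel::configure(input, output, 16 /* num_elems_processed_per_iteration */);
    }
    // run() would then read from _input and write to _output over the given window
    void run(const Window &window) override;
};
} // namespace arm_compute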
diff --git a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
new file mode 100644
index 0000000000..0866d4ee57
--- /dev/null
+++ b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H__
+#define __ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** CPP kernel to find corner candidates
+ */
+class CPPCornerCandidatesKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ CPPCornerCandidatesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPCornerCandidatesKernel(const CPPCornerCandidatesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPCornerCandidatesKernel &operator=(const CPPCornerCandidatesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CPPCornerCandidatesKernel(CPPCornerCandidatesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CPPCornerCandidatesKernel &operator=(CPPCornerCandidatesKernel &&) = default;
+ /** Default destructor */
+ ~CPPCornerCandidatesKernel() = default;
+
+ /** Setup the kernel parameters
+ *
+     * @param[in] input Source image (Harris score). Format supported: F32
+ * @param[out] output Destination array of InternalKeypoint
+ * @param[out] num_corner_candidates Number of corner candidates
+ */
+ void configure(const IImage *input, InternalKeypoint *output, int32_t *num_corner_candidates);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ int32_t *_num_corner_candidates; /**< Number of corner candidates */
+    std::mutex _corner_candidates_mutex; /**< Mutex to prevent race conditions */
+ const IImage *_input; /**< Source image - Harris score */
+    InternalKeypoint *_output; /**< Destination array of InternalKeypoint */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H__ */
diff --git a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
new file mode 100644
index 0000000000..bcb3026959
--- /dev/null
+++ b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__
+#define __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** CPP kernel to perform in-place non-maxima suppression, based on Euclidean distance, on an IDetectionWindowArray
+ *
+ * @note This kernel is meant to be used alongside HOG or other object detection algorithms to perform a non-maxima suppression on an
+ * IDetectionWindowArray
+ */
+class CPPDetectionWindowNonMaximaSuppressionKernel : public ICPPKernel
+{
+public:
+ /** Default constructor */
+ CPPDetectionWindowNonMaximaSuppressionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPDetectionWindowNonMaximaSuppressionKernel(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPDetectionWindowNonMaximaSuppressionKernel &operator=(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CPPDetectionWindowNonMaximaSuppressionKernel(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CPPDetectionWindowNonMaximaSuppressionKernel &operator=(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
+    /** Initialise the kernel's input, output and the minimum Euclidean distance
+     *
+     * @attention If a @ref CLDetectionWindowArray is passed to the kernel, the map() and unmap() methods of @ref CLDetectionWindowArray must be called respectively before and after
+     * the run() method of @ref CPPDetectionWindowNonMaximaSuppressionKernel
+ *
+ * @param[in, out] input_output Input/Output array of @ref DetectionWindow
+ * @param[in] min_distance Radial Euclidean distance for non-maxima suppression
+ */
+ void configure(IDetectionWindowArray *input_output, float min_distance);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ IDetectionWindowArray *_input_output;
+ float _min_distance;
+};
+}
+
+#endif /* __ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H__ */
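A usage sketch (the 2.0f radius is arbitrary; as noted above, with a CLDetectionWindowArray the map()/unmap() calls must bracket run()):

#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"

using namespace arm_compute;

void suppress_detection_windows(IDetectionWindowArray &windows)
{
    CPPDetectionWindowNonMaximaSuppressionKernel nms;
    nms.configure(&windows, 2.0f /* min_distance */);

    // The kernel is not parallelisable, so it is run on its full window
    nms.run(nms.window());
}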
diff --git a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
new file mode 100644
index 0000000000..b7a7d9ff9f
--- /dev/null
+++ b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__
+#define __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/IArray.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+/** CPP kernel to sort corner candidates by strength and suppress keypoints closer than a minimum Euclidean distance */
+class CPPSortEuclideanDistanceKernel : public ICPPKernel
+{
+public:
+ /** Default constructor */
+ CPPSortEuclideanDistanceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPSortEuclideanDistanceKernel(const CPPSortEuclideanDistanceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CPPSortEuclideanDistanceKernel &operator=(const CPPSortEuclideanDistanceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CPPSortEuclideanDistanceKernel(CPPSortEuclideanDistanceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CPPSortEuclideanDistanceKernel &operator=(CPPSortEuclideanDistanceKernel &&) = default;
+    /** Initialise the kernel's source, destination and minimum distance.
+     *
+     * @param[in,out] in_out Input internal keypoints. Marked as in/out because the kernel writes 0 to the strength member of suppressed keypoints.
+ * @param[out] output Output keypoints.
+ * @param[in] num_corner_candidates Pointer to the number of corner candidates in the input array
+ * @param[in] min_distance Radial Euclidean distance to use
+ */
+ void configure(InternalKeypoint *in_out, IKeyPointArray *output, const int32_t *num_corner_candidates, float min_distance);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ const int32_t *_num_corner_candidates; /**< Number of corner candidates */
+ float _min_distance; /**< Radial Euclidean distance */
+ InternalKeypoint *_in_out; /**< Source array of InternalKeypoint */
+    IKeyPointArray *_output; /**< Destination array of KeyPoint */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H__ */
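The two CPP kernels above are designed to be chained, as in the Harris corners pipeline. A rough sketch of that flow; the function name, the raw candidate buffer and the single-threaded run() calls are simplifications for the example:

#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"

#include <cstdint>
#include <vector>

using namespace arm_compute;

void harris_postprocess(const IImage *score, IKeyPointArray *corners, float min_distance, size_t max_candidates)
{
    std::vector<InternalKeypoint> candidates(max_candidates);
    int32_t num_candidates = 0;

    CPPCornerCandidatesKernel      extract;
    CPPSortEuclideanDistanceKernel suppress;
    extract.configure(score, candidates.data(), &num_candidates);
    suppress.configure(candidates.data(), corners, &num_candidates, min_distance);

    extract.run(extract.window());   // collects candidates and counts them
    suppress.run(suppress.window()); // sorts and writes the surviving keypoints
}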
diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h
new file mode 100644
index 0000000000..3a99abbd74
--- /dev/null
+++ b/arm_compute/core/Coordinates.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_COORDINATES_H__
+#define __ARM_COMPUTE_COORDINATES_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Coordinates of an item */
+class Coordinates : public Dimensions<int>
+{
+public:
+ /** Constructor to initialize the coordinates.
+ *
+ * @param[in] coords Values to initialize the dimensions.
+ */
+ template <typename... Ts>
+ constexpr Coordinates(Ts... coords)
+ : Dimensions{ coords... }
+ {
+ }
+ /** Allow instances of this class to be copy constructed */
+ constexpr Coordinates(const Coordinates &) = default;
+ /** Allow instances of this class to be copied */
+ Coordinates &operator=(const Coordinates &) = default;
+ /** Allow instances of this class to be move constructed */
+ constexpr Coordinates(Coordinates &&) = default;
+ /** Allow instances of this class to be moved */
+ Coordinates &operator=(Coordinates &&) = default;
+ /** Default destructor */
+ ~Coordinates() = default;
+};
+}
+#endif /*__ARM_COMPUTE_COORDINATES_H__*/
diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h
new file mode 100644
index 0000000000..b080435b69
--- /dev/null
+++ b/arm_compute/core/Dimensions.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_DIMENSIONS_H__
+#define __ARM_COMPUTE_DIMENSIONS_H__
+
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <numeric>
+
+namespace arm_compute
+{
+/** Constant value used to indicate maximum dimensions of a Window, TensorShape and Coordinates */
+constexpr size_t MAX_DIMS = 6;
+
+/** Container for dimension values and the effective dimensionality */
+template <typename T>
+class Dimensions
+{
+public:
+    /** Maximum number of dimensions the tensor can have */
+ static constexpr size_t num_max_dimensions = MAX_DIMS;
+
+    /** Constructor to initialize the dimensions.
+ *
+ * @param[in] dims Values to initialize the dimensions.
+ */
+ template <typename... Ts>
+ Dimensions(Ts... dims)
+ : _id{ { dims... } }, _num_dimensions{ sizeof...(dims) }
+ {
+ }
+
+ /** Allow instances of this class to be copy constructed */
+ Dimensions(const Dimensions &) = default;
+
+ /** Allow instances of this class to be copied */
+ Dimensions &operator=(const Dimensions &) = default;
+
+ /** Allow instances of this class to be move constructed */
+ Dimensions(Dimensions &&) = default;
+
+ /** Allow instances of this class to be moved */
+ Dimensions &operator=(Dimensions &&) = default;
+
+ /** Accessor to set the value of one of the dimensions.
+ *
+ * @param[in] dimension Dimension for which the value is set.
+ * @param[in] value Value to be set for the dimension.
+ */
+ void set(size_t dimension, T value)
+ {
+ ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions);
+ _id[dimension] = value;
+ _num_dimensions = std::max(_num_dimensions, dimension + 1);
+ }
+ /** Alias to access the size of the first dimension */
+ T x() const
+ {
+ return _id[0];
+ }
+ /** Alias to access the size of the second dimension */
+ T y() const
+ {
+ return _id[1];
+ }
+ /** Alias to access the size of the third dimension */
+ T z() const
+ {
+ return _id[2];
+ }
+ /** Generic accessor to get the size of any dimension
+ *
+ * @note Precondition: dimension < Dimensions::num_max_dimensions
+ *
+ * @param[in] dimension Dimension of the wanted size
+ *
+ * @return The size of the requested dimension.
+ */
+ T operator[](size_t dimension) const
+ {
+ ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions);
+ return _id[dimension];
+ }
+ /** Returns the effective dimensionality of the tensor */
+ unsigned int num_dimensions() const
+ {
+ return _num_dimensions;
+ }
+
+ /** Set number of dimensions */
+ void set_num_dimensions(size_t num_dimensions)
+ {
+ _num_dimensions = num_dimensions;
+ }
+
+    /** Collapse dimensions.
+     *
+     * @param[in] n     Number of dimensions to collapse into @p first.
+     * @param[in] first Dimension into which the following @p n are collapsed.
+     */
+ void collapse(size_t n, size_t first = 0)
+ {
+ ARM_COMPUTE_ERROR_ON(first + n > _id.size());
+
+ // Collapse dimensions into the first
+ _id[first] = std::accumulate(_id.cbegin() + first, _id.cbegin() + first + n, 1, std::multiplies<T>());
+ // Shift the remaining dimensions down
+ std::copy(_id.begin() + first + n, _id.end(), _id.begin() + first + 1);
+ // Reduce the number of dimensions
+ _num_dimensions -= n - 1;
+ // Fill the now empty dimensions with zero
+ std::fill(_id.begin() + _num_dimensions, _id.end(), 0);
+ }
+
+ /** Returns a read/write iterator that points to the first element in the dimension array. */
+ typename std::array<T, num_max_dimensions>::iterator begin()
+ {
+ return _id.begin();
+ }
+ /** Returns a read-only (constant) iterator that points to the first element in the dimension array. */
+ typename std::array<T, num_max_dimensions>::const_iterator begin() const
+ {
+ return _id.begin();
+ }
+ /** Returns a read-only (constant) iterator that points to the first element in the dimension array. */
+ typename std::array<T, num_max_dimensions>::const_iterator cbegin() const
+ {
+ return begin();
+ }
+ /** Returns a read/write iterator that points one past the last element in the dimension array. */
+ typename std::array<T, num_max_dimensions>::iterator end()
+ {
+ return _id.end();
+ }
+ /** Returns a read-only (constant) iterator that points one past the last element in the dimension array. */
+ typename std::array<T, num_max_dimensions>::const_iterator end() const
+ {
+ return _id.end();
+ }
+ /** Returns a read-only (constant) iterator that points one past the last element in the dimension array. */
+ typename std::array<T, num_max_dimensions>::const_iterator cend() const
+ {
+ return end();
+ }
+
+protected:
+ /** Protected destructor. */
+ ~Dimensions() = default;
+
+ std::array<T, num_max_dimensions> _id;
+ size_t _num_dimensions{ 0 };
+};
+}
+#endif /*__ARM_COMPUTE_DIMENSIONS_H__*/
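A worked example of set() and collapse(), using the Coordinates specialisation defined earlier:

#include "arm_compute/core/Coordinates.h"

using namespace arm_compute;

void dimensions_example()
{
    Coordinates c(4, 3, 2); // {4, 3, 2}, num_dimensions() == 3

    c.set(3, 5);            // {4, 3, 2, 5}, num_dimensions() == 4

    // Collapse the 2 dimensions starting at index 0 into one:
    // c becomes {12, 2, 5} (4 * 3 = 12) and num_dimensions() drops to 3
    c.collapse(2 /* n */, 0 /* first */);
}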
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
new file mode 100644
index 0000000000..c4c452bacf
--- /dev/null
+++ b/arm_compute/core/Error.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ERROR_H__
+#define __ARM_COMPUTE_ERROR_H__
+
+#include <stdexcept>
+
+/** Print the given message then throw an std::runtime_error.
+ *
+ * @param[in] ... Message to display before aborting.
+ */
+#define ARM_COMPUTE_ERROR(...) ::arm_compute::error(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT
+
+/** Print the given message then throw an std::runtime_error.
+ *
+ * @param[in] func Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ * @param[in] ... Message to display before aborting.
+ */
+#define ARM_COMPUTE_ERROR_LOC(func, file, line, ...) ::arm_compute::error(func, file, line, __VA_ARGS__) // NOLINT
+
+/** To avoid unused variables warnings
+ *
+ * This is useful if for example a variable is only used
+ * in debug builds and generates a warning in release builds.
+ *
+ * @param[in] var Variable which is unused
+ */
+#define ARM_COMPUTE_UNUSED(var) (void)(var)
+
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+/** Print the given message
+ *
+ * @param[in] ... Message to display
+ */
+#define ARM_COMPUTE_INFO(...) ::arm_compute::debug(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT
+/** If the condition is true, the given message is printed
+ *
+ * @param[in] cond Condition to evaluate.
+ * @param[in] ... Message to print if cond is true.
+ */
+#define ARM_COMPUTE_INFO_ON_MSG(cond, ...) \
+ do \
+ { \
+ if(cond) \
+ { \
+ ARM_COMPUTE_INFO(__VA_ARGS__); \
+ } \
+ } while(0)
+#else /* ARM_COMPUTE_DEBUG_ENABLED */
+#define ARM_COMPUTE_INFO_ON_MSG(cond, ...)
+#define ARM_COMPUTE_INFO(...)
+#endif /* ARM_COMPUTE_DEBUG_ENABLED */
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+/** If the condition is true, the given message is printed and an exception is thrown
+ *
+ * @param[in] cond Condition to evaluate.
+ * @param[in] ... Message to print if cond is true.
+ */
+#define ARM_COMPUTE_ERROR_ON_MSG(cond, ...) \
+ do \
+ { \
+ if(cond) \
+ { \
+ ARM_COMPUTE_ERROR(__VA_ARGS__); \
+ } \
+ } while(0)
+
+/** If the condition is true, the given message is printed and an exception is thrown
+ *
+ * @param[in] cond Condition to evaluate.
+ * @param[in] func Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ * @param[in] ... Message to print if cond is true.
+ */
+#define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...) \
+ do \
+ { \
+ if(cond) \
+ { \
+ ARM_COMPUTE_ERROR_LOC(func, file, line, __VA_ARGS__); \
+ } \
+ } while(0)
+
+/** If the condition is true, an std::logic_error exception is thrown with the given message, otherwise the value is returned
+ *
+ * @param[in] cond Condition to evaluate.
+ * @param[in] val  Value to be returned.
+ * @param[in] msg  Message of the exception if cond is true.
+ */
+#define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) ((cond) ? throw std::logic_error(msg) : (val))
+#else /* ARM_COMPUTE_ASSERTS_ENABLED */
+#define ARM_COMPUTE_ERROR_ON_MSG(cond, ...)
+#define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...)
+#define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) (val)
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+/** If the condition is true then an error message is printed and an exception thrown
+ *
+ * @param[in] cond Condition to evaluate
+ */
+#define ARM_COMPUTE_ERROR_ON(cond) \
+ ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
+
+/** If the condition is true then an error message is printed and an exception thrown
+ *
+ * @param[in] cond Condition to evaluate
+ * @param[in] func Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ */
+#define ARM_COMPUTE_ERROR_ON_LOC(cond, func, file, line) \
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, #cond)
+
+namespace arm_compute
+{
+/** Print an error message then throw an std::runtime_error
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] msg Message to display before aborting.
+ * @param[in] ... Variable number of arguments of the message.
+ */
+[[noreturn]] void error(const char *function, const char *file, const int line, const char *msg, ...);
+
+/** Print a debug message
+ *
+ * @param[in] function Function the message was printed from.
+ * @param[in] file     Name of the file the message was printed from.
+ * @param[in] line     Line on which the message was printed.
+ * @param[in] msg      Message to display.
+ * @param[in] ...      Variable number of arguments of the message.
+ */
+void debug(const char *function, const char *file, const int line, const char *msg, ...);
+}
+
+#endif /* __ARM_COMPUTE_ERROR_H__ */
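A usage sketch of the macros above; the function and its checks are invented for the example:

#include "arm_compute/core/Error.h"

namespace
{
void validate_args(const void *ptr, int width, bool verbose)
{
    ARM_COMPUTE_ERROR_ON(ptr == nullptr);                           // throws with the stringified condition
    ARM_COMPUTE_ERROR_ON_MSG(width <= 0, "width must be positive"); // throws with a custom message

    // Both checks compile out when ARM_COMPUTE_ASSERTS_ENABLED is not defined,
    // so mark the parameters as used to keep release builds warning-free
    ARM_COMPUTE_UNUSED(ptr);
    ARM_COMPUTE_UNUSED(width);
    ARM_COMPUTE_UNUSED(verbose);

    ARM_COMPUTE_INFO_ON_MSG(verbose, "validation passed"); // printed only with ARM_COMPUTE_DEBUG_ENABLED
}
} // namespace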
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
new file mode 100644
index 0000000000..925b4949a3
--- /dev/null
+++ b/arm_compute/core/FixedPoint.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_FIXEDPOINT_H__
+#define __ARM_COMPUTE_FIXEDPOINT_H__
+
+#include <cstdint>
+
+namespace arm_compute
+{
+using qint8_t = int8_t; /**< 8 bit fixed point scalar value */
+using qint16_t = int16_t; /**< 16 bit fixed point scalar value */
+using qint32_t = int32_t; /**< 32 bit fixed point scalar value */
+
+/** 8 bit fixed point scalar saturating shift left
+ *
+ * @param[in] a     8 bit fixed point input
+ * @param[in] shift Shift amount
+ *
+ * @return The result of the 8 bit fixed point shift. The result is saturated in case of overflow
+ */
+qint8_t sqshl_qs8(qint8_t a, int shift);
+
+/** 8 bit fixed point scalar absolute value
+ *
+ * @param[in] a 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point absolute value
+ */
+qint8_t sabs_qs8(qint8_t a);
+
+/** 8 bit fixed point scalar add
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point addition
+ */
+qint8_t sadd_qs8(qint8_t a, qint8_t b);
+
+/** 8 bit fixed point scalar saturating add
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point addition. The result is saturated in case of overflow
+ */
+qint8_t sqadd_qs8(qint8_t a, qint8_t b);
+
+/** 16 bit fixed point scalar saturating add
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ *
+ * @return The result of the 16 bit fixed point addition. The result is saturated in case of overflow
+ */
+qint16_t sqadd_qs16(qint16_t a, qint16_t b);
+
+/** 8 bit fixed point scalar subtraction
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point subtraction
+ */
+qint8_t ssub_qs8(qint8_t a, qint8_t b);
+
+/** 8 bit fixed point scalar saturating subtraction
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ *
+ * @return The result of the 8 bit fixed point subtraction. The result is saturated in case of overflow
+ */
+qint8_t sqsub_qs8(qint8_t a, qint8_t b);
+
+/** 8 bit fixed point scalar multiply
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point multiplication.
+ */
+qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar saturating multiply
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point multiplication. The result is saturated in case of overflow
+ */
+qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar multiply long
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point multiplication long. The result is saturated in case of overflow
+ */
+qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 16 bit fixed point scalar saturating multiply
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow
+ */
+qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar inverse square root
+ *
+ * @param[in] a 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point inverse square root.
+ */
+qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position);
+
+/** 8 bit fixed point scalar division
+ *
+ * @param[in] a First 8 bit fixed point input
+ * @param[in] b Second 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point division.
+ */
+qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position);
+
+/** 8 bit fixed point scalar saturating exponential
+ *
+ * @param[in] a 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point exponential. The result is saturated in case of overflow
+ */
+qint8_t sqexp_qs8(qint8_t a, int fixed_point_position);
+
+/** 8 bit fixed point scalar logarithm
+ *
+ * @param[in] a 8 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point logarithm.
+ */
+qint8_t slog_qs8(qint8_t a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point to float
+ *
+ * @param[in] a Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float
+ */
+float scvt_f32_qs8(qint8_t a, int fixed_point_position);
+
+/** Convert a float to 8 bit fixed point
+ *
+ * @param[in] a Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8_t scvt_qs8_f32(float a, int fixed_point_position);
+
+/** Convert a 16 bit fixed point to float
+ *
+ * @param[in] a Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 16 bit fixed point -> float
+ */
+float scvt_f32_qs16(qint16_t a, int fixed_point_position);
+
+/** Convert a float to 16 bit fixed point
+ *
+ * @param[in] a Input to convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 16 bit fixed point
+ */
+qint16_t scvt_qs16_f32(float a, int fixed_point_position);
+
+/** Scalar saturating move and narrow.
+ *
+ * @param[in] a Input to convert to 8 bit fixed point
+ *
+ * @return The narrowing conversion to 8 bit
+ */
+qint8_t sqmovn_qs16(qint16_t a);
+}
+#include "arm_compute/core/FixedPoint.inl"
+#endif /* __ARM_COMPUTE_FIXEDPOINT_H__ */
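A worked example with 5 fractional bits (fixed_point_position = 5, so 1.0 is encoded as 1 << 5 = 32); the commented values follow from the definitions above:

#include "arm_compute/core/FixedPoint.h"

using namespace arm_compute;

void fixed_point_example()
{
    const int fpp = 5; // 5 fractional bits: resolution of 1/32

    qint8_t a = scvt_qs8_f32(1.5f, fpp); // 1.5 * 32 = 48
    qint8_t b = scvt_qs8_f32(2.0f, fpp); // 2.0 * 32 = 64

    qint8_t sum  = sqadd_qs8(a, b);      // 48 + 64 = 112, i.e. 3.5
    qint8_t prod = sqmul_qs8(a, b, fpp); // (48 * 64 + 16) >> 5 = 96, i.e. 3.0

    float back = scvt_f32_qs8(prod, fpp); // 96 / 32 = 3.0f

    (void)sum;
    (void)back;
}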
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
new file mode 100644
index 0000000000..4263a6f00d
--- /dev/null
+++ b/arm_compute/core/FixedPoint.inl
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <cmath>
+#include <limits>
+
+namespace
+{
+template <typename TpIn, typename TpSat>
+inline TpSat saturate_convert(TpIn a)
+{
+ if(a > std::numeric_limits<TpSat>::max())
+ {
+ a = std::numeric_limits<TpSat>::max();
+ }
+ if(a < std::numeric_limits<TpSat>::min())
+ {
+ a = std::numeric_limits<TpSat>::min();
+ }
+ return static_cast<TpSat>(a);
+}
+} // namespace
+
+namespace arm_compute
+{
+inline qint8_t sqshl_qs8(qint8_t a, int shift)
+{
+ qint16_t tmp = static_cast<qint16_t>(a) << shift;
+ // Saturate the result in case of overflow and cast to qint8_t
+ return saturate_convert<qint16_t, qint8_t>(tmp);
+}
+
+inline qint8_t sabs_qs8(qint8_t a)
+{
+    // Saturating absolute value: the magnitude of -128 does not fit in qint8_t, so it saturates to 127
+    return saturate_convert<qint16_t, qint8_t>((a < 0) ? -static_cast<qint16_t>(a) : static_cast<qint16_t>(a));
+}
+
+inline qint8_t sadd_qs8(qint8_t a, qint8_t b)
+{
+ return a + b;
+}
+
+inline qint8_t sqadd_qs8(qint8_t a, qint8_t b)
+{
+ // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow
+ qint16_t tmp = (static_cast<qint16_t>(a) + static_cast<qint16_t>(b));
+
+ // Saturate the result in case of overflow and cast to qint8_t
+ return saturate_convert<qint16_t, qint8_t>(tmp);
+}
+
+inline qint16_t sqadd_qs16(qint16_t a, qint16_t b)
+{
+    // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow
+ qint32_t tmp = (static_cast<qint32_t>(a) + static_cast<qint32_t>(b));
+
+ // Saturate the result in case of overflow and cast to qint16_t
+ return saturate_convert<qint32_t, qint16_t>(tmp);
+}
+
+inline qint8_t ssub_qs8(qint8_t a, qint8_t b)
+{
+ return a - b;
+}
+
+inline qint8_t sqsub_qs8(qint8_t a, qint8_t b)
+{
+    // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow
+ qint16_t tmp = static_cast<qint16_t>(a) - static_cast<qint16_t>(b);
+
+ // Saturate the result in case of overflow and cast to qint8_t
+ return saturate_convert<qint16_t, qint8_t>(tmp);
+}
+
+inline qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+ const qint16_t round_up_const = (1 << (fixed_point_position - 1));
+
+ qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
+
+ // Rounding up
+ tmp += round_up_const;
+
+ return static_cast<qint8_t>(tmp >> fixed_point_position);
+}
+
+inline qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+ const qint16_t round_up_const = (1 << (fixed_point_position - 1));
+
+ qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
+
+ // Rounding up
+ tmp += round_up_const;
+
+ return saturate_convert<qint16_t, qint8_t>(tmp >> fixed_point_position);
+}
+
+inline qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position)
+{
+ const qint32_t round_up_const = (1 << (fixed_point_position - 1));
+
+ qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b);
+
+ // Rounding up
+ tmp += round_up_const;
+
+ return saturate_convert<qint32_t, qint16_t>(tmp >> fixed_point_position);
+}
+
+inline qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+ const qint16_t round_up_const = (1 << (fixed_point_position - 1));
+
+ qint16_t tmp = static_cast<qint16_t>(a) * static_cast<qint16_t>(b);
+
+ // Rounding up
+ tmp += round_up_const;
+
+ return tmp >> fixed_point_position;
+}
+
+inline qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position)
+{
+ qint8_t shift = 8 - (fixed_point_position + (__builtin_clz(a) - 24));
+
+ qint8_t const_three = (3 << fixed_point_position);
+ qint8_t temp = shift < 0 ? (a << -shift) : (a >> shift);
+ qint8_t x2 = temp;
+
+ // We need three iterations to find the result
+ for(int i = 0; i < 3; i++)
+ {
+ qint8_t three_minus_dx = ssub_qs8(const_three, smul_qs8(temp, smul_qs8(x2, x2, fixed_point_position), fixed_point_position));
+ x2 = (smul_qs8(x2, three_minus_dx, fixed_point_position) >> 1);
+ }
+
+ temp = shift < 0 ? (x2 << (-shift >> 1)) : (x2 >> (shift >> 1));
+
+ return temp;
+}
+
+inline qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position)
+{
+ qint16_t temp = a << fixed_point_position;
+ return (qint8_t)(temp / b);
+}
+
+inline qint8_t sqexp_qs8(qint8_t a, int fixed_point_position)
+{
+ // Constants
+ qint8_t const_one = (1 << fixed_point_position);
+ qint8_t ln2 = ((0x58 >> (6 - fixed_point_position)) + 1) >> 1;
+ qint8_t inv_ln2 = (((0x38 >> (6 - fixed_point_position)) + 1) >> 1) | const_one;
+ qint8_t A = ((0x7F >> (6 - fixed_point_position)) + 1) >> 1;
+ qint8_t B = ((0x3F >> (6 - fixed_point_position)) + 1) >> 1;
+ qint8_t C = ((0x16 >> (6 - fixed_point_position)) + 1) >> 1;
+ qint8_t D = ((0x05 >> (6 - fixed_point_position)) + 1) >> 1;
+
+ // Polynomial expansion
+ int dec_a = (sqmul_qs8(a, inv_ln2, fixed_point_position) >> fixed_point_position);
+ qint8_t alpha = sabs_qs8(sqsub_qs8(a, sqmul_qs8(ln2, sqshl_qs8(dec_a, fixed_point_position), fixed_point_position)));
+ qint8_t sum = sqadd_qs8(sqmul_qs8(alpha, D, fixed_point_position), C);
+ sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), B);
+ sum = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), A);
+ sum = sqmul_qs8(alpha, sum, fixed_point_position);
+ sum = sqadd_qs8(sum, const_one);
+
+ return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs8(sum, dec_a);
+}
+
+inline qint8_t slog_qs8(qint8_t a, int fixed_point_position)
+{
+ // Constants
+ qint8_t const_one = (1 << fixed_point_position);
+ qint8_t ln2 = (0x58 >> (7 - fixed_point_position));
+ qint8_t A = (0x5C >> (7 - fixed_point_position - 1));
+ qint8_t B = -(0x56 >> (7 - fixed_point_position));
+ qint8_t C = (0x29 >> (7 - fixed_point_position));
+ qint8_t D = -(0x0A >> (7 - fixed_point_position));
+
+    // log(1) == 0; for a <= 0 the function is undefined, so return 0 and avoid the division below
+    if((const_one == a) || (a <= 0))
+ {
+ return 0;
+ }
+ else if(a < const_one)
+ {
+ return -slog_qs8(sdiv_qs8(const_one, a, fixed_point_position), fixed_point_position);
+ }
+
+ // Remove even powers of 2
+ qint8_t shift_val = 31 - __builtin_clz(a >> fixed_point_position);
+ a >>= shift_val;
+ a = ssub_qs8(a, const_one);
+
+ // Polynomial expansion
+ auto sum = sqadd_qs8(sqmul_qs8(a, D, fixed_point_position), C);
+ sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), B);
+ sum = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), A);
+ sum = sqmul_qs8(a, sum, fixed_point_position);
+
+ return smul_qs8(sadd_qs8(sum, shift_val << fixed_point_position), ln2, fixed_point_position);
+}
+
+inline float scvt_f32_qs8(qint8_t a, int fixed_point_position)
+{
+ return static_cast<float>(a) / (1 << fixed_point_position);
+}
+
+inline qint8_t scvt_qs8_f32(float a, int fixed_point_position)
+{
+ // round_nearest_integer(a * 2^(fixed_point_position))
+ return static_cast<qint8_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f);
+}
+
+inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
+{
+ return static_cast<float>(a) / (1 << fixed_point_position);
+}
+
+inline qint16_t scvt_qs16_f32(float a, int fixed_point_position)
+{
+ // round_nearest_integer(a * 2^(fixed_point_position))
+ return static_cast<qint16_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f);
+}
+
+inline qint8_t sqmovn_qs16(qint16_t a)
+{
+ // Saturate the result in case of overflow and cast to qint8_t
+ return saturate_convert<qint16_t, qint8_t>(a);
+}
+}
diff --git a/arm_compute/core/HOGInfo.h b/arm_compute/core/HOGInfo.h
new file mode 100644
index 0000000000..654629306d
--- /dev/null
+++ b/arm_compute/core/HOGInfo.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_HOGINFO_H__
+#define __ARM_COMPUTE_HOGINFO_H__
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the HOG's metadata */
+class HOGInfo
+{
+public:
+ /** Default constructor */
+ HOGInfo();
+ /** Default destructor */
+ virtual ~HOGInfo() = default;
+ /** Allow instances of this class to be copy constructed */
+ HOGInfo(const HOGInfo &) = default;
+ /** Allow instances of this class to be copied */
+ HOGInfo &operator=(const HOGInfo &) = default;
+ /** Allow instances of this class to be move constructed */
+ HOGInfo(HOGInfo &&) = default;
+ /** Allow instances of this class to be moved */
+ HOGInfo &operator=(HOGInfo &&) = default;
+ /** Constructor
+ *
+ * @param[in] cell_size Cell size in pixels
+ * @param[in] block_size Block size in pixels. Must be a multiple of cell_size.
+ * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride.
+ * @param[in] block_stride Distance in pixels between 2 consecutive blocks along the x and y direction. Must be a multiple of cell size
+ * @param[in] num_bins Number of histogram bins for each cell
+ * @param[in] normalization_type (Optional) Normalization type to use for each block
+ * @param[in] l2_hyst_threshold (Optional) Threshold used for L2HYS_NORM normalization method
+ * @param[in] phase_type (Optional) Type of @ref PhaseType
+ */
+ HOGInfo(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
+ HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED);
+ /** Initialize the metadata structure with the given parameters
+ *
+ * @param[in] cell_size Cell size in pixels
+ * @param[in] block_size Block size in pixels. Must be a multiple of cell_size.
+ * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride.
+ * @param[in] block_stride Distance in pixels between 2 consecutive blocks along the x and y direction. Must be a multiple of cell size
+ * @param[in] num_bins Number of histogram bins for each cell
+ * @param[in] normalization_type (Optional) Normalization type to use for each block
+ * @param[in] l2_hyst_threshold (Optional) Threshold used for L2HYS_NORM normalization method
+ * @param[in] phase_type (Optional) Type of @ref PhaseType
+ */
+ void init(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
+ HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED);
+ /** The cell size in pixels
+ *
+ * @return The cell size in pixels
+ */
+ const Size2D &cell_size() const;
+ /** The block size in pixels
+ *
+ * @return The block size in pixels
+ */
+ const Size2D &block_size() const;
+ /** The detection window size in pixels
+ *
+ * @return The detection window size in pixels
+ */
+ const Size2D &detection_window_size() const;
+ /** The block stride in pixels. The block stride is the distance between 2 consecutive blocks
+ *
+ * @return The block stride in pixels
+ */
+ const Size2D &block_stride() const;
+ /** The number of histogram bins for each cell
+ *
+ * @return The number of histogram bins for each cell
+ */
+ size_t num_bins() const;
+ /** The normalization type
+ *
+ * @return The normalization type
+ */
+ HOGNormType normalization_type() const;
+ /** Threshold used for L2HYS_NORM normalization type
+ *
+ * @return Threshold used for L2HYS_NORM normalization type
+ */
+ float l2_hyst_threshold() const;
+ /** The type of @ref PhaseType
+ *
+ * @return The type of @ref PhaseType
+ */
+ PhaseType phase_type() const;
+ /** The size of HOG descriptor
+ *
+ * @return The size of HOG descriptor
+ */
+ size_t descriptor_size() const;
+ /** Calculates the number of cells for each block
+ *
+ * @return The Size2D data object which stores the number of cells along the x and y directions
+ */
+ Size2D num_cells_per_block() const;
+ /** Calculates the number of blocks for the given image size
+ *
+ * @param[in] image_size The input image size data object
+ *
+ * @return The Size2D data object which stores the number of blocks along the x and y directions
+ */
+ Size2D num_blocks_per_image(const Size2D &image_size) const;
+
+private:
+ Size2D _cell_size;
+ Size2D _block_size;
+ Size2D _detection_window_size;
+ Size2D _block_stride;
+ size_t _num_bins;
+ HOGNormType _normalization_type;
+ float _l2_hyst_threshold;
+ PhaseType _phase_type;
+ size_t _descriptor_size;
+};
+}
+#endif /*__ARM_COMPUTE_HOGINFO_H__ */
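For intuition, the classic 64x128 pedestrian window (8x8 cells, 16x16 blocks, 8x8 block stride, 9 bins) has 2x2 cells per block and 7x15 block positions per window, so the standard HOG descriptor holds 7 * 15 * 2 * 2 * 9 = 3780 values. A sketch, assuming descriptor_size() follows this standard computation:

#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Size2D.h"

using namespace arm_compute;

void hog_info_example()
{
    HOGInfo info(Size2D(8, 8),    /* cell_size */
                 Size2D(16, 16),  /* block_size */
                 Size2D(64, 128), /* detection_window_size */
                 Size2D(8, 8),    /* block_stride */
                 9 /* num_bins */);

    Size2D cells = info.num_cells_per_block(); // expected 2 x 2
    size_t len   = info.descriptor_size();     // expected 3780 for this configuration

    (void)cells;
    (void)len;
}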
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
new file mode 100644
index 0000000000..07318eaf7a
--- /dev/null
+++ b/arm_compute/core/Helpers.h
@@ -0,0 +1,507 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_HELPERS_H__
+#define __ARM_COMPUTE_HELPERS_H__
+
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace arm_compute
+{
+class IKernel;
+class ITensor;
+class ITensorInfo;
+
+namespace cpp14
+{
+template <class T>
+struct _Unique_if
+{
+ typedef std::unique_ptr<T> _Single_object;
+};
+
+template <class T>
+struct _Unique_if<T[]>
+{
+ typedef std::unique_ptr<T[]> _Unknown_bound;
+};
+
+template <class T, size_t N>
+struct _Unique_if<T[N]>
+{
+ typedef void _Known_bound;
+};
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object
+make_unique(Args &&... args)
+{
+ return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+template <class T>
+typename _Unique_if<T>::_Unknown_bound
+make_unique(size_t n)
+{
+ typedef typename std::remove_extent<T>::type U;
+ return std::unique_ptr<T>(new U[n]());
+}
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound
+make_unique(Args &&...) = delete;
+}
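This is the usual C++11 backfill of std::make_unique; a usage sketch (the Blob type is invented for the example):

#include "arm_compute/core/Helpers.h"

#include <memory>

struct Blob
{
    Blob(int w, int h) : width(w), height(h) {}
    int width;
    int height;
};

void make_unique_example()
{
    // Single object: constructor arguments are perfectly forwarded
    std::unique_ptr<Blob> blob = arm_compute::cpp14::make_unique<Blob>(640, 480);

    // Array of unknown bound: 16 value-initialised elements
    std::unique_ptr<int[]> buf = arm_compute::cpp14::make_unique<int[]>(16);

    buf[0] = blob->width;
}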
+
+template <typename T>
+struct enable_bitwise_ops
+{
+ static constexpr bool value = false;
+};
+
+template <typename T>
+typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs)
+{
+ using underlying_type = typename std::underlying_type<T>::type;
+ return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs));
+}
+
+namespace traits
+{
+/** Check if a type T is contained in a tuple Tuple of types */
+template <typename T, typename Tuple>
+struct is_contained;
+
+template <typename T>
+struct is_contained<T, std::tuple<>> : std::false_type
+{
+};
+
+template <typename T, typename... Ts>
+struct is_contained<T, std::tuple<T, Ts...>> : std::true_type
+{
+};
+
+template <typename T, typename U, typename... Ts>
+struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>>
+{
+};
+}
+
+/** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
+ * the real coordinates and the top-left integer coordinates.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value. Format: Single channel U8
+ * @param[in] stride Stride to access the bottom-left and bottom-right pixel values
+ * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer
+ * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value
+ */
+inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy);
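The declaration corresponds to the standard bilinear blend of the 2x2 neighbourhood; a sketch of an equivalent computation, not necessarily the library's exact implementation:

#include <cstddef>
#include <cstdint>

// result = TL*(1-dx)*(1-dy) + TR*dx*(1-dy) + BL*(1-dx)*dy + BR*dx*dy
inline uint8_t delta_bilinear_c1u8_sketch(const uint8_t *pixel_ptr, size_t stride, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;

    const float tl = pixel_ptr[0];          // top-left
    const float tr = pixel_ptr[1];          // top-right
    const float bl = pixel_ptr[stride];     // bottom-left
    const float br = pixel_ptr[stride + 1]; // bottom-right

    return static_cast<uint8_t>(tl * dx1 * dy1 + tr * dx * dy1 + bl * dx1 * dy + br * dx * dy);
}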
+
+/** Return the pixel at (x,y) using bilinear interpolation. The image must be single channel U8
+ *
+ * @warning Only works if the iterator was created with an IImage
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] stride          Stride in bytes of the image
+ * @param[in] x X position of the wanted pixel
+ * @param[in] y Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using bilinear interpolation.
+ */
+inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y);
+
+/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel U8
+ *
+ * @warning Only works if the iterator was created with an IImage
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] stride Stride in bytes of the image
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ * @param[in] x X position of the wanted pixel
+ * @param[in] y Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using bilinear interpolation.
+ */
+inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y);
+
+/** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8
+ *
+ * @note The interpolation area depends on the width and height ratio of the input and output images
+ * @note Currently average of the contributing pixels is calculated
+ *
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] stride Stride in bytes of the image
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ * @param[in] wr Width ratio between the input image width and the output image width.
+ * @param[in] hr Height ratio between the input image height and the output image height.
+ * @param[in] x X position of the wanted pixel
+ * @param[in] y Y position of the wanted pixel
+ *
+ * @return The pixel at (x, y) using area interpolation.
+ */
+inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y);
+
+/** Performs clamping between a lower and an upper value.
+ *
+ * @param[in] n Value to clamp.
+ * @param[in] lower Lower threshold.
+ * @param[in] upper Upper threshold.
+ *
+ * @return Clamped value.
+ */
+template <typename T>
+inline T clamp(const T &n, const T &lower, const T &upper)
+{
+ return std::max(lower, std::min(n, upper));
+}
+
+/** Base case of for_each. Does nothing. */
+template <typename F>
+inline void for_each(F &&)
+{
+}
+
+/** Call the function for each of the arguments
+ *
+ * @param[in] func Function to be called
+ * @param[in] arg Argument passed to the function
+ * @param[in] args Remaining arguments
+ */
+template <typename F, typename T, typename... Ts>
+inline void for_each(F &&func, T &&arg, Ts &&... args)
+{
+ func(arg);
+ for_each(func, args...);
+}
+
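for_each simply applies the callable to each trailing argument in order. A hypothetical sketch:

    #include "arm_compute/core/Helpers.h"

    #include <cassert>

    int main()
    {
        int sum = 0;
        // Calls the lambda with 1, then 2, then 3.
        arm_compute::for_each([&sum](int v) { sum += v; }, 1, 2, 3);
        assert(sum == 6);
        return 0;
    }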
+/** Base case of foldl.
+ *
+ * @return The value passed in.
+ */
+template <typename F, typename T>
+inline T foldl(F &&, const T &value)
+{
+ return value;
+}
+
+/** Base case of foldl with two values.
+ *
+ * @return Function evaluation for value1 and value2
+ */
+template <typename F, typename T, typename U>
+inline auto foldl(F &&func, T &&value1, U &&value2) -> decltype(func(value1, value2))
+{
+ return func(value1, value2);
+}
+
+/** Fold left.
+ *
+ * @param[in] func Function to be called
+ * @param[in] initial Initial value
+ * @param[in] value Argument passed to the function
+ * @param[in] values Remaining arguments
+ *
+ * @return The result of folding @p func over the arguments from the left.
+ */
+template <typename F, typename I, typename T, typename... Vs>
+inline I foldl(F &&func, I &&initial, T &&value, Vs &&... values)
+{
+ return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...);
+}
+
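foldl reduces its arguments from the left, i.e. foldl(f, a, b, c) evaluates as f(f(a, b), c). A hypothetical sketch:

    #include "arm_compute/core/Helpers.h"

    #include <cassert>

    int main()
    {
        auto add = [](int a, int b) { return a + b; };
        // Evaluates as add(add(add(0, 1), 2), 3).
        assert(arm_compute::foldl(add, 0, 1, 2, 3) == 6);
        return 0;
    }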
+/** Iterator updated by @ref execute_window_loop for each window element */
+class Iterator
+{
+public:
+ /** Default constructor to create an empty iterator */
+ constexpr Iterator();
+ /** Create a container iterator for the metadata and allocation contained in the ITensor
+ *
+ * @param[in] tensor The tensor to associate to the iterator.
+ * @param[in] window The window which will be used to iterate over the tensor.
+ */
+ Iterator(const ITensor *tensor, const Window &window);
+
+ /** Increment the iterator along the specified dimension by the step value associated with that dimension.
+ *
+ * @warning It is the caller's responsibility to call increment(dimension+1) when reaching the end of a dimension, the iterator will not check for overflow.
+ *
+ * @note When incrementing a dimension 'n' the coordinates of all the dimensions in the range (0,n-1) are reset. For example if you iterate over a 2D image, every time you change row (dimension 1), the iterator for the width (dimension 0) is reset to its start.
+ *
+ * @param[in] dimension Dimension to increment
+ */
+ void increment(size_t dimension);
+
+ /** Return the offset in bytes from the first element to the current position of the iterator
+ *
+ * @return The current position of the iterator in bytes relative to the first element.
+ */
+ constexpr int offset() const;
+
+ /** Return a pointer to the current pixel.
+ *
+ * @warning Only works if the iterator was created with an ITensor.
+ *
+ * @return equivalent to buffer() + offset()
+ */
+ constexpr uint8_t *ptr() const;
+
+ /** Move the iterator back to the beginning of the specified dimension.
+ *
+ * @param[in] dimension Dimension to reset
+ */
+ void reset(size_t dimension);
+
+private:
+ uint8_t *_ptr;
+
+ class Dimension
+ {
+ public:
+ constexpr Dimension()
+ : _dim_start(0), _stride(0)
+ {
+ }
+
+ int _dim_start;
+ int _stride;
+ };
+
+ std::array<Dimension, Coordinates::num_max_dimensions> _dims;
+};
+
+/** Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
+ * The absolute coordinates of the current element are passed to the lambda_function at each iteration.
+ *
+ * @param[in] w Window to iterate through.
+ * @param[in] lambda_function The function to call at each iteration. Its signature must be equivalent to void(const Coordinates &id),
+ * where id represents the absolute coordinates of the item to process.
+ * @param[in,out] iterators Tensor iterators which will be updated by this function before calling lambda_function.
+ */
+template <typename L, typename... Ts>
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators);
+
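Window, Iterator and execute_window_loop are typically used together. The following is a hypothetical sketch, not part of the patch: the fill_coords helper and its setup are illustrative and assume an already-allocated single-channel U8 tensor.

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/Window.h"

    #include <cstdint>

    // Hypothetical sketch: fill an allocated 2D U8 tensor so that each element
    // holds the sum of its X and Y coordinates.
    void fill_coords(arm_compute::ITensor &tensor)
    {
        using namespace arm_compute;

        const TensorShape &shape = tensor.info()->tensor_shape();

        Window win;
        win.set(0, Window::Dimension(0, static_cast<int>(shape[0])));
        win.set(1, Window::Dimension(0, static_cast<int>(shape[1])));

        Iterator it(&tensor, win);
        execute_window_loop(win, [&](const Coordinates & id)
        {
            // it.ptr() already points at the element for the current coordinates.
            *it.ptr() = static_cast<uint8_t>(id[0] + id[1]);
        },
        it);
    }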
+/** Update window and padding size for each of the access patterns.
+ *
+ * First the window size is reduced based on all access patterns that are not
+ * allowed to modify the padding of the underlying tensor. Then the padding of
+ * the remaining tensors is increased to match the window.
+ *
+ * @param[in] win Window that is used by the kernel.
+ * @param[in] patterns Access patterns used to calculate the final window and padding.
+ *
+ * @return True if the window has been changed. Changes to the padding do not
+ * influence the returned value.
+ */
+template <typename... Ts>
+bool update_window_and_padding(Window &win, Ts &&... patterns)
+{
+ bool window_changed = false;
+
+ for_each([&](const IAccessWindow & w)
+ {
+ window_changed |= w.update_window_if_needed(win);
+ },
+ patterns...);
+
+ bool padding_changed = false;
+
+ for_each([&](const IAccessWindow & w)
+ {
+ padding_changed |= w.update_padding_if_needed(win);
+ },
+ patterns...);
+
+ return window_changed;
+}
+
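A hypothetical configure-time sketch of the pattern this helper supports (the configure_sketch name and the one-element horizontal access windows are illustrative, not part of the patch):

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/IAccessWindow.h"

    // Hypothetical sketch: one input and one output, each accessed one element
    // at a time along a row.
    bool configure_sketch(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output)
    {
        arm_compute::Window win = arm_compute::calculate_max_window(*input);

        arm_compute::AccessWindowHorizontal input_access(input, 0, 1);
        arm_compute::AccessWindowHorizontal output_access(output, 0, 1);

        // Shrinks the window and/or grows the padding as required.
        return arm_compute::update_window_and_padding(win, input_access, output_access);
    }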
+/** Calculate the maximum window for a given tensor shape and border setting
+ *
+ * @param[in] info Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps (Optional) Number of elements processed for each step.
+ * @param[in] skip_border (Optional) If true exclude the border region from the window.
+ * @param[in] border_size (Optional) Border size.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
+ *
+ * @param[in] info Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps (Optional) Number of elements processed for each step.
+ * @param[in] skip_border (Optional) If true exclude the border region from the window.
+ * @param[in] border_size (Optional) Border size. The border region will be excluded from the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+
+/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
+ *
+ * @param[in] info Tensor info object defining the shape of the object for which the window is created.
+ * @param[in] steps (Optional) Number of elements processed for each step.
+ * @param[in] border_size (Optional) Border size. The border region will be included in the window.
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+
+/** Intersect multiple valid regions.
+ *
+ * @param[in] regions Valid regions.
+ *
+ * @return Intersection of all regions.
+ */
+template <typename... Ts>
+ValidRegion intersect_valid_regions(Ts &&... regions)
+{
+ auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
+ {
+ ValidRegion region;
+
+ for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
+ {
+ region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
+ }
+
+ for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
+ {
+ region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
+ }
+
+ return region;
+ };
+
+ return foldl(intersect, std::forward<Ts>(regions)...);
+}
+
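Note that the intersection is computed element-wise: anchors take the maximum and shapes take the minimum in each dimension. For example, intersecting a region anchored at (0, 0) with shape 10x10 and a region anchored at (2, 3) with shape 8x8 yields a region anchored at (2, 3) with shape 8x8.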
+/** Create a strides object based on the provided strides and the tensor dimensions.
+ *
+ * @param[in] info Tensor info object providing the shape of the tensor for unspecified strides.
+ * @param[in] stride_x Stride to be used in X dimension (in bytes).
+ * @param[in] fixed_strides Strides to be used in higher dimensions starting at Y (in bytes).
+ *
+ * @return Strides object based on the specified strides. Missing strides are
+ * calculated based on the tensor shape and the strides of lower dimensions.
+ */
+template <typename T, typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
+{
+ const TensorShape &shape = info.tensor_shape();
+
+ // Create strides object
+ Strides strides(stride_x, fixed_strides...);
+
+ for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+ {
+ strides.set(i, shape[i - 1] * strides[i - 1]);
+ }
+
+ return strides;
+}
+
+/** Create a strides object based on the tensor dimensions.
+ *
+ * @param[in] info Tensor info object used to compute the strides.
+ *
+ * @return Strides object based on element size and tensor shape.
+ */
+template <typename... Ts>
+inline Strides compute_strides(const ITensorInfo &info)
+{
+ return compute_strides(info, info.element_size());
+}
+
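A worked check of the stride completion (hypothetical driver code): a 4x3 single-channel U8 image has a 1-byte element stride, so the missing row stride is filled in as shape[0] * strides[0] = 4 bytes.

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/TensorInfo.h"

    #include <cassert>

    int main()
    {
        // 4x3 single-channel U8 image: element size is 1 byte.
        arm_compute::TensorInfo info(arm_compute::TensorShape(4U, 3U), arm_compute::Format::U8);
        const arm_compute::Strides strides = arm_compute::compute_strides(info);
        assert(strides[0] == 1); // element_size()
        assert(strides[1] == 4); // shape[0] * strides[0]
        return 0;
    }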
+/** Auto initialize the tensor info (shape, number of channels, data type and fixed point position) if the current assignment is empty.
+ *
+ * @param[in,out] info Tensor info used to check and assign.
+ * @param[in] shape New shape.
+ * @param[in] num_channels New number of channels.
+ * @param[in] data_type New data type.
+ * @param[in] fixed_point_position New fixed point position.
+ *
+ * @return True if the tensor info has been initialized
+ */
+bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position);
+
+/** Set the shape to the specified value if the current assignment is empty.
+ *
+ * @param[in,out] info Tensor info used to check and assign.
+ * @param[in] shape New shape.
+ *
+ * @return True if the shape has been changed.
+ */
+bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape);
+
+/** Set the format, data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info Tensor info used to check and assign.
+ * @param[in] format New format.
+ *
+ * @return True if the format has been changed.
+ */
+bool set_format_if_unknown(ITensorInfo &info, Format format);
+
+/** Set the data type and number of channels to the specified value if
+ * the current data type is unknown.
+ *
+ * @param[in,out] info Tensor info used to check and assign.
+ * @param[in] data_type New data type.
+ *
+ * @return True if the data type has been changed.
+ */
+bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type);
+
+/** Set the fixed point position to the specified value if
+ * the current fixed point position is 0 and the data type is QS8 or QS16
+ *
+ * @param[in,out] info Tensor info used to check and assign.
+ * @param[in] fixed_point_position New fixed point position
+ *
+ * @return True if the fixed point position has been changed.
+ */
+bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position);
+} // namespace arm_compute
+
+#include "arm_compute/core/Helpers.inl"
+#endif /*__ARM_COMPUTE_HELPERS_H__ */
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
new file mode 100644
index 0000000000..f885810078
--- /dev/null
+++ b/arm_compute/core/Helpers.inl
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+#include <numeric>
+
+namespace arm_compute
+{
+inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy)
+{
+ ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+ const float dx1 = 1.0f - dx;
+ const float dy1 = 1.0f - dy;
+
+ const float a00 = *pixel_ptr;
+ const float a01 = *(pixel_ptr + 1);
+ const float a10 = *(pixel_ptr + stride);
+ const float a11 = *(pixel_ptr + stride + 1);
+
+ const float w1 = dx1 * dy1;
+ const float w2 = dx * dy1;
+ const float w3 = dx1 * dy;
+ const float w4 = dx * dy;
+
+ return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
+}
+
+inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y)
+{
+ ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+ const int32_t xi = x;
+ const int32_t yi = y;
+
+ const float dx = x - xi;
+ const float dy = y - yi;
+
+ return delta_bilinear_c1u8(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
+}
+
+inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
+{
+ ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+ x = std::max(-1.f, std::min(x, static_cast<float>(width)));
+ y = std::max(-1.f, std::min(y, static_cast<float>(height)));
+
+ const float xi = std::floor(x);
+ const float yi = std::floor(y);
+
+ const float dx = x - xi;
+ const float dy = y - yi;
+
+ return delta_bilinear_c1u8(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
+}
+
+inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
+{
+ ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+ // Calculate sampling position
+ float in_x = (x + 0.5f) * wr - 0.5f;
+ float in_y = (y + 0.5f) * hr - 0.5f;
+
+ // Get bounding box offsets
+ int x_from = std::floor(x * wr - 0.5f - in_x);
+ int y_from = std::floor(y * hr - 0.5f - in_y);
+ int x_to = std::ceil((x + 1) * wr - 0.5f - in_x);
+ int y_to = std::ceil((y + 1) * hr - 0.5f - in_y);
+
+ // Clamp position to borders
+ in_x = std::max(-1.f, std::min(in_x, static_cast<float>(width)));
+ in_y = std::max(-1.f, std::min(in_y, static_cast<float>(height)));
+
+ // Clamp bounding box offsets to borders
+ x_from = ((in_x + x_from) < -1) ? -1 : x_from;
+ y_from = ((in_y + y_from) < -1) ? -1 : y_from;
+ x_to = ((in_x + x_to) > width) ? (width - in_x) : x_to;
+ y_to = ((in_y + y_to) > height) ? (height - in_y) : y_to;
+
+ // Get pixel index
+ const int xi = std::floor(in_x);
+ const int yi = std::floor(in_y);
+
+ // Bounding box elements in each dimension
+ const int x_elements = (x_to - x_from + 1);
+ const int y_elements = (y_to - y_from + 1);
+ ARM_COMPUTE_ERROR_ON(x_elements == 0 || y_elements == 0);
+
+ // Sum pixels in area
+ int sum = 0;
+ for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+ {
+ const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
+ sum = std::accumulate(ptr, ptr + x_elements, sum);
+ }
+
+ // Return average
+ return sum / (x_elements * y_elements);
+}
+
+template <size_t dimension>
+struct IncrementIterators
+{
+ template <typename T, typename... Ts>
+ static void unroll(T &&it, Ts &&... iterators)
+ {
+ it.increment(dimension);
+ IncrementIterators<dimension>::unroll<Ts...>(std::forward<Ts>(iterators)...);
+ }
+
+ template <typename T>
+ static void unroll(T &&it)
+ {
+ it.increment(dimension);
+ // End of recursion
+ }
+
+ static void unroll()
+ {
+ // End of recursion
+ }
+};
+
+template <size_t dim>
+struct ForEachDimension
+{
+ template <typename L, typename... Ts>
+ static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+ {
+ const auto &d = w[dim - 1];
+
+ for(auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators < dim - 1 >::unroll(iterators...))
+ {
+ id.set(dim - 1, v);
+ ForEachDimension < dim - 1 >::unroll(w, id, lambda_function, iterators...);
+ }
+ }
+};
+
+template <>
+struct ForEachDimension<0>
+{
+ template <typename L, typename... Ts>
+ static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+ {
+ lambda_function(id);
+ }
+};
+
+template <typename L, typename... Ts>
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
+{
+ w.validate();
+
+ Coordinates id;
+ ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function), std::forward<Ts>(iterators)...);
+}
+
+inline constexpr Iterator::Iterator()
+ : _ptr(nullptr), _dims()
+{
+}
+
+inline Iterator::Iterator(const ITensor *tensor, const Window &win)
+ : Iterator()
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+ const ITensorInfo *info = tensor->info();
+ ARM_COMPUTE_ERROR_ON(info == nullptr);
+ const Strides &strides = info->strides_in_bytes();
+
+ _ptr = tensor->buffer() + info->offset_first_element_in_bytes();
+
+ //Initialize the stride for each dimension and calculate the position of the first element of the iteration:
+ for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+ {
+ _dims[n]._stride = win[n].step() * strides[n];
+ std::get<0>(_dims)._dim_start += strides[n] * win[n].start();
+ }
+
+ //Copy the starting point to all the dimensions:
+ for(unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
+ {
+ _dims[n]._dim_start = std::get<0>(_dims)._dim_start;
+ }
+
+ ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(win, info->num_dimensions());
+}
+
+inline void Iterator::increment(const size_t dimension)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+
+ _dims[dimension]._dim_start += _dims[dimension]._stride;
+
+ for(unsigned int n = 0; n < dimension; ++n)
+ {
+ _dims[n]._dim_start = _dims[dimension]._dim_start;
+ }
+}
+
+inline constexpr int Iterator::offset() const
+{
+ return _dims.at(0)._dim_start;
+}
+
+inline constexpr uint8_t *Iterator::ptr() const
+{
+ return _ptr + _dims.at(0)._dim_start;
+}
+
+inline void Iterator::reset(const size_t dimension)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions - 1);
+
+ _dims[dimension]._dim_start = _dims[dimension + 1]._dim_start;
+
+ for(unsigned int n = 0; n < dimension; ++n)
+ {
+ _dims[n]._dim_start = _dims[dimension]._dim_start;
+ }
+}
+
+inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position)
+{
+ if(info.tensor_shape().total_size() == 0)
+ {
+ info.set_data_type(data_type);
+ info.set_tensor_shape(shape);
+ info.set_num_channels(num_channels);
+ info.set_fixed_point_position(fixed_point_position);
+ return true;
+ }
+
+ return false;
+}
+
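A hypothetical sketch of the auto-initialization behaviour: the first call on an empty TensorInfo fills in the metadata, and subsequent calls are no-ops.

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/TensorInfo.h"

    #include <cassert>

    int main()
    {
        arm_compute::TensorInfo info; // empty: total_size() == 0
        const bool init = arm_compute::auto_init_if_empty(
            info, arm_compute::TensorShape(8U, 8U), 1, arm_compute::DataType::F32, 0);
        assert(init);
        // A second call is a no-op because the shape is no longer empty.
        assert(!arm_compute::auto_init_if_empty(info, arm_compute::TensorShape(4U), 1, arm_compute::DataType::U8, 0));
        return 0;
    }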
+inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
+{
+ if(info.tensor_shape().total_size() == 0)
+ {
+ info.set_tensor_shape(shape);
+ return true;
+ }
+
+ return false;
+}
+
+inline bool set_format_if_unknown(ITensorInfo &info, Format format)
+{
+ if(info.data_type() == DataType::UNKNOWN)
+ {
+ info.set_format(format);
+ return true;
+ }
+
+ return false;
+}
+
+inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
+{
+ if(info.data_type() == DataType::UNKNOWN)
+ {
+ info.set_data_type(data_type);
+ return true;
+ }
+
+ return false;
+}
+
+inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position)
+{
+ if(info.fixed_point_position() == 0 && (info.data_type() == DataType::QS8 || info.data_type() == DataType::QS16))
+ {
+ info.set_fixed_point_position(fixed_point_position);
+ return true;
+ }
+
+ return false;
+}
+} // namespace arm_compute
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
new file mode 100644
index 0000000000..cf7490d53e
--- /dev/null
+++ b/arm_compute/core/IAccessWindow.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IACCESS_WINDOW_H__
+#define __ARM_COMPUTE_IACCESS_WINDOW_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class Window;
+class ITensorInfo;
+
+/** Decrease @p required in steps of @p step until it's less than or equal to @p available.
+ *
+ * @param[in] required Number of required bytes.
+ * @param[in] available Number of available bytes.
+ * @param[in] step Step size used to decrease required bytes.
+ *
+ * @return Largest value not greater than @p available that can be reached from @p required in steps of @p step
+ *
+ **/
+inline int adjust_down(int required, int available, int step)
+{
+ ARM_COMPUTE_ERROR_ON(step <= 0);
+
+ return required - step * ((required - available + step - 1) / step);
+}
+
+/** Increase @p required in steps of @p step until it's greater than or equal to @p available.
+ *
+ * @param[in] required Number of required bytes.
+ * @param[in] available Number of available bytes.
+ * @param[in] step Step size used to increase required bytes.
+ *
+ * @return Smallest value not smaller than @p available that can be reached from @p required in steps of @p step
+ *
+ **/
+inline int adjust_up(int required, int available, int step)
+{
+ ARM_COMPUTE_ERROR_ON(step <= 0);
+
+ return required + step * ((available - required + step - 1) / step);
+}
+
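Worked examples of the two helpers (hypothetical driver code): starting from required = 10 with step = 2, adjust_down walks 10, 8, 6 to land at or below available = 7, while adjust_up walks 10, 12, 14 to land at or above available = 13.

    #include "arm_compute/core/IAccessWindow.h"

    #include <cassert>

    int main()
    {
        // Largest value not above `available` reachable from `required` in steps of `step`.
        assert(arm_compute::adjust_down(10, 7, 2) == 6);
        // Smallest value not below `available` reachable from `required` in steps of `step`.
        assert(arm_compute::adjust_up(10, 13, 2) == 14);
        return 0;
    }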
+/** Interface describing methods to update access window and padding based on kernel parameters. */
+class IAccessWindow
+{
+public:
+ virtual ~IAccessWindow() = default;
+ /** Shrink the window if padding is not large enough.
+ *
+ * @param[in] window Window used by the kernel.
+ *
+ * @return True if the window has been changed.
+ */
+ virtual bool update_window_if_needed(Window &window) const = 0;
+ /** Increase the padding to be large enough for the window.
+ *
+ * @param[in] window Window used by the kernel.
+ *
+ * @return True if the padding has been changed.
+ */
+ virtual bool update_padding_if_needed(const Window &window) const = 0;
+ /** Compute the valid region based on access pattern and valid region of the inputs.
+ *
+ * @note This method assumes that there is no border.
+ *
+ * @param[in] window Execution window of the kernel.
+ * @param[in] input_valid_region Combined valid region of all inputs.
+ * @param[in] border_undefined Undefined borders are excluded from the valid region.
+ * @param[in] border_size Size of the border around the XY-plane of the tensor.
+ */
+ virtual ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const = 0;
+};
+
+/** Implementation of a rectangular access pattern. */
+class AccessWindowRectangle : public IAccessWindow
+{
+public:
+ /** Constructor for a rectangular access pattern.
+ *
+ * @note Width and height have to be non-negative.
+ *
+ * @param[in,out] info Tensor info of the accessed tensor.
+ * @param[in] x Offset of the access in X direction.
+ * @param[in] y Offset of the access in Y direction.
+ * @param[in] width Number of elements that are accessed in X direction.
+ * @param[in] height Number of elements that are accessed in Y direction.
+ */
+ AccessWindowRectangle(ITensorInfo *info, int x, int y, int width, int height)
+ : AccessWindowRectangle(info, x, y, width, height, 1.f, 1.f)
+ {
+ }
+
+ /** Constructor for a rectangular access pattern.
+ *
+ * @note Width, height and scale have to be non-negative.
+ *
+ * @param[in,out] info Tensor info of the accessed tensor.
+ * @param[in] x Offset of the access in X direction.
+ * @param[in] y Offset of the access in Y direction.
+ * @param[in] width Number of elements that are accessed in X direction.
+ * @param[in] height Number of elements that are accessed in Y direction.
+ * @param[in] scale_x Ratio along the X direction between the window used by the execute_window_loop and the rectangular access pattern defined
+ * @param[in] scale_y Ratio along the Y direction between the window used by the execute_window_loop and the rectangular access pattern defined
+ */
+ AccessWindowRectangle(ITensorInfo *info, int x, int y, int width, int height, float scale_x, float scale_y)
+ : _info(info), _x(x), _y(y), _width(width), _height(height), _scale_x(scale_x), _scale_y(scale_y)
+ {
+ ARM_COMPUTE_ERROR_ON(width < 0);
+ ARM_COMPUTE_ERROR_ON(height < 0);
+ ARM_COMPUTE_ERROR_ON(scale_x < 0);
+ ARM_COMPUTE_ERROR_ON(scale_y < 0);
+ }
+
+ AccessWindowRectangle(const AccessWindowRectangle &) = delete;
+ AccessWindowRectangle &operator=(const AccessWindowRectangle &) = delete;
+ AccessWindowRectangle(AccessWindowRectangle &&) = default;
+ AccessWindowRectangle &operator=(AccessWindowRectangle &&) = default;
+ ~AccessWindowRectangle() = default;
+
+ /** Set the valid region based on access pattern, valid region of the inputs and border mode.
+ *
+ * @param[in] window Execution window of the kernel.
+ * @param[in] input_valid_region Combined valid region of all inputs.
+ * @param[in] border_undefined (Optional) Undefined borders are excluded from the valid region.
+ * @param[in] border_size (Optional) Size of the border around the XY-plane of the tensor.
+ */
+ void set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined = false, const BorderSize &border_size = BorderSize(0));
+
+ /** Compute the valid region based on access pattern, valid region of the inputs and border mode.
+ *
+ * @note This method assumes that there is no border.
+ *
+ * @param[in] window Execution window of the kernel.
+ * @param[in] input_valid_region Combined valid region of all inputs.
+ */
+ ValidRegion compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const;
+
+ // Inherited methods overridden:
+
+ /** @note This method assumes that all elements written by the kernel are valid. */
+ ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+
+ bool update_window_if_needed(Window &window) const override;
+ bool update_padding_if_needed(const Window &window) const override;
+
+protected:
+ ITensorInfo *_info;
+ int _x;
+ int _y;
+ int _width;
+ int _height;
+ float _scale_x;
+ float _scale_y;
+};
+
+/** Implementation of a column access pattern. */
+class AccessWindowVertical : public AccessWindowRectangle
+{
+public:
+ /** Constructor for a column access pattern.
+ *
+ * @note Height has to be non-negative.
+ *
+ * @param[in,out] info Tensor info of the accessed tensor.
+ * @param[in] y Offset of the access in Y direction.
+ * @param[in] height Number of elements that are accessed in Y direction.
+ * @param[in] scale_y Ratio along the Y direction between the window used by the execute_window_loop and the rectangular access pattern defined
+ */
+ AccessWindowVertical(ITensorInfo *info, int y, int height, float scale_y = 1.f)
+ : AccessWindowRectangle(info, 0, y, 1, height, 1.f, scale_y)
+ {
+ ARM_COMPUTE_ERROR_ON(height < 0);
+ ARM_COMPUTE_ERROR_ON(scale_y < 0);
+ }
+};
+
+/** Implementation of a row access pattern. */
+class AccessWindowHorizontal : public AccessWindowRectangle
+{
+public:
+ /** Constructor for a row access pattern.
+ *
+ * @note Width has to be non-negative.
+ *
+ * @param[in,out] info Tensor info of the accessed tensor.
+ * @param[in] x Offset of the access in X direction.
+ * @param[in] width Number of elements that are accessed in X direction.
+ * @param[in] scale_x Ratio along the X direction between the window used by the execute_window_loop and the rectangular access pattern defined
+ */
+ AccessWindowHorizontal(ITensorInfo *info, int x, int width, float scale_x = 1.f)
+ : AccessWindowRectangle(info, x, 0, width, 1, scale_x, 1.f)
+ {
+ ARM_COMPUTE_ERROR_ON(width < 0);
+ ARM_COMPUTE_ERROR_ON(scale_x < 0);
+ }
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_IACCESS_WINDOW_H__*/
diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h
new file mode 100644
index 0000000000..2ed56100cf
--- /dev/null
+++ b/arm_compute/core/IArray.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IARRAY_H__
+#define __ARM_COMPUTE_IARRAY_H__
+
+#include "arm_compute/core/Error.h"
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class KeyPoint;
+class Coordinates2D;
+class DetectionWindow;
+class Size2D;
+
+/** Array of type T */
+template <class T>
+class IArray
+{
+public:
+ /** Default constructor */
+ IArray()
+ : _num_values(0), _max_size(0) {};
+ /** Constructor: initializes an array which can contain up to max_num_values values
+ *
+ * @param[in] max_num_values Maximum number of values the array will be able to store
+ */
+ IArray(size_t max_num_values)
+ : _num_values(0), _max_size(max_num_values)
+ {
+ }
+ /** Maximum number of values which can be stored in this array
+ *
+ * @return Maximum number of values
+ */
+ size_t max_num_values() const
+ {
+ return _max_size;
+ }
+ /** Default virtual destructor */
+ virtual ~IArray() = default;
+ /** Number of values currently stored in the array
+ *
+ * @return Number of values currently stored in the array, or max_num_values + 1 if the array has overflowed.
+ */
+ size_t num_values() const
+ {
+ return _num_values;
+ }
+ /** Append the passed argument to the end of the array if there is room.
+ *
+ * @param[in] val Value to add to the array.
+ *
+ * @return True if the point was successfully added to the array. False if the array is full and the point couldn't be added.
+ */
+ bool push_back(const T &val)
+ {
+ ARM_COMPUTE_ERROR_ON(0 == _max_size);
+ if(_num_values >= max_num_values())
+ {
+ _num_values = max_num_values() + 1;
+ return false;
+ }
+ at(_num_values) = val;
+ _num_values++;
+ return true;
+ }
+ /** Clear all the points from the array. */
+ void clear()
+ {
+ _num_values = 0;
+ }
+ /** Did we lose some values because the array is too small?
+ *
+ * @return True if we tried to add a value using push_back() but there wasn't any room left to store it.
+ * False if all the values were successfully added to the array.
+ */
+ bool overflow() const
+ {
+ return _num_values > max_num_values();
+ }
+ /** Pointer to the first element of the array
+ *
+ * Other elements of the array can be accessed using buffer()[idx] for 0 <= idx < num_values().
+ *
+ * @return A pointer to the first element of the array
+ */
+ virtual T *buffer() const = 0;
+ /** Reference to the element of the array located at the given index
+ *
+ * @param[in] index Index of the element
+ *
+ * @return A reference to the element of the array located at the given index.
+ */
+ virtual T &at(size_t index) const
+ {
+ ARM_COMPUTE_ERROR_ON(buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(index >= max_num_values());
+ return buffer()[index];
+ }
+ /** Resizes the array to contain "num" elements. If "num" is smaller than the maximum array size, the content is reduced to its first "num" elements.
+ * "num" must not be bigger than the maximum number of values which can be stored in this array.
+ *
+ * @param[in] num The new array size in number of elements
+ */
+ void resize(size_t num)
+ {
+ ARM_COMPUTE_ERROR_ON(num > max_num_values());
+ _num_values = num;
+ };
+
+private:
+ size_t _num_values;
+ size_t _max_size;
+};
+using IKeyPointArray = IArray<KeyPoint>;
+using ICoordinates2DArray = IArray<Coordinates2D>;
+using IDetectionWindowArray = IArray<DetectionWindow>;
+using ISize2DArray = IArray<Size2D>;
+using IUInt8Array = IArray<uint8_t>;
+using IUInt16Array = IArray<uint16_t>;
+using IUInt32Array = IArray<uint32_t>;
+using IInt16Array = IArray<int16_t>;
+using IInt32Array = IArray<int32_t>;
+using IFloatArray = IArray<float>;
+}
+#endif /* __ARM_COMPUTE_IARRAY_H__ */
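IArray is abstract (buffer() is pure virtual); the library provides its own concrete implementations elsewhere in this patch. The following hypothetical minimal subclass, backed by a std::vector, illustrates the overflow contract of push_back():

    #include "arm_compute/core/IArray.h"

    #include <cassert>
    #include <vector>

    // Hypothetical sketch of a concrete IArray; only for illustration.
    template <class T>
    class SimpleArray : public arm_compute::IArray<T>
    {
    public:
        explicit SimpleArray(size_t max_num_values)
            : arm_compute::IArray<T>(max_num_values), _storage(max_num_values)
        {
        }
        T *buffer() const override
        {
            return const_cast<T *>(_storage.data());
        }

    private:
        std::vector<T> _storage;
    };

    int main()
    {
        SimpleArray<int> arr(2);
        assert(arr.push_back(1) && arr.push_back(2));
        assert(!arr.push_back(3)); // full: push_back fails and marks the overflow
        assert(arr.overflow());
        return 0;
    }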
diff --git a/arm_compute/core/IDistribution.h b/arm_compute/core/IDistribution.h
new file mode 100644
index 0000000000..b57543a3bf
--- /dev/null
+++ b/arm_compute/core/IDistribution.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IDISTRIBUTION_H__
+#define __ARM_COMPUTE_IDISTRIBUTION_H__
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Interface for distribution objects */
+class IDistribution
+{
+public:
+ /** Default virtual destructor */
+ virtual ~IDistribution() = default;
+ /** Returns the dimensions of the distribution.
+ *
+ * @note This is fixed to 1-dimensional distribution for now.
+ * @return Dimensions of the distribution.
+ */
+ virtual size_t dimensions() const = 0;
+ /** Returns the total size in bytes of the distribution.
+ *
+ * @return Total size of the distribution in bytes.
+ */
+ virtual size_t size() const = 0;
+ /** Returns a pointer to the start of the distribution.
+ * Other elements of the array can be accessed using buffer()[idx] for 0 <= idx < num_bins()
+ *
+ * @return Pointer to the start of the distribution.
+ */
+ virtual uint32_t *buffer() const = 0;
+ /** Clears the distribution by setting every element to zero. */
+ void clear() const;
+};
+}
+#endif /* __ARM_COMPUTE_IDISTRIBUTION_H__ */
diff --git a/arm_compute/core/IDistribution1D.h b/arm_compute/core/IDistribution1D.h
new file mode 100644
index 0000000000..ca8bfc0a7d
--- /dev/null
+++ b/arm_compute/core/IDistribution1D.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IDISTRIBUTION1D_H__
+#define __ARM_COMPUTE_IDISTRIBUTION1D_H__
+
+#include "arm_compute/core/IDistribution.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** 1D Distribution interface */
+class IDistribution1D : public IDistribution
+{
+public:
+ /** Constructor: Creates a 1D Distribution of a consecutive interval [offset, offset + range - 1]
+ * defined by a start offset and valid range, divided equally into num_bins parts.
+ *
+ * @param[in] num_bins The number of bins the distribution is divided in.
+ * @param[in] offset The start of the values to use.
+ * @param[in] range The total number of the consecutive values of the distribution interval.
+ */
+ IDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
+ /** Returns the number of bins that the distribution has.
+ *
+ * @return Number of bins of the distribution.
+ */
+ size_t num_bins() const;
+ /** Returns the offset of the distribution.
+ *
+ * @return Offset of the distribution.
+ */
+ int32_t offset() const;
+ /** Returns the range of the distribution.
+ *
+ * @return Range of the distribution.
+ */
+ uint32_t range() const;
+ /** Returns the window of the distribution, which is the range divided by the number of bins.
+ *
+ * @note If the range is not evenly divisible by the number of bins then the window is invalid.
+ * For example, offset 0, range 256 and 16 bins give a window of 256 / 16 = 16 consecutive values per bin.
+ *
+ * @return Window of the distribution.
+ */
+ uint32_t window() const;
+ /** Sets the range of the distribution.
+ *
+ * @param[in] range New range of the distribution to be set.
+ */
+ void set_range(uint32_t range);
+
+ // Inherited methods overridden:
+ size_t size() const override;
+ size_t dimensions() const override;
+
+private:
+ size_t _num_bins; /**< Number of bins. */
+ int32_t _offset; /**< Offset, which indicates the start of the usable values. */
+ uint32_t _range; /**< The total number of consecutive values of the distribution interval */
+};
+}
+#endif /* __ARM_COMPUTE_IDISTRIBUTION1D_H__ */
diff --git a/arm_compute/core/IHOG.h b/arm_compute/core/IHOG.h
new file mode 100644
index 0000000000..8bf713ae82
--- /dev/null
+++ b/arm_compute/core/IHOG.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IHOG_H__
+#define __ARM_COMPUTE_IHOG_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+class HOGInfo;
+/** Interface for HOG data-object */
+class IHOG
+{
+public:
+ /** Interface to be implemented by the child class to return the HOG's metadata
+ *
+ * @return A pointer to the HOG's metadata.
+ */
+ virtual const HOGInfo *info() const = 0;
+ /** Default virtual destructor */
+ virtual ~IHOG() = default;
+ /** Pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
+ *
+ * @note Other elements of the array can be accessed using descriptor()[idx] for idx=[0, descriptor_size() - 1]
+ *
+ * @return A pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
+ */
+ virtual float *descriptor() const = 0;
+};
+}
+#endif /* __ARM_COMPUTE_IHOG_H__ */
diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h
new file mode 100644
index 0000000000..4f3812b6da
--- /dev/null
+++ b/arm_compute/core/IKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IKERNEL_H__
+#define __ARM_COMPUTE_IKERNEL_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+/** Common information for all the kernels */
+class IKernel
+{
+public:
+ /** Constructor */
+ IKernel();
+ /** Destructor */
+ virtual ~IKernel() = default;
+ /** Indicates whether or not the kernel is parallelisable
+ *
+ * If the kernel is parallelisable then the window returned by window() can be split into sub-windows
+ * which can then be run in parallel.
+ *
+ * If the kernel is not parallelisable then only the window returned by window() can be passed to run()
+ *
+ * @return True if the kernel is parallelisable
+ */
+ virtual bool is_parallelisable() const;
+ /** The size of the border for that kernel
+ *
+ * @return The width in number of elements of the border.
+ */
+ virtual BorderSize border_size() const;
+ /** The maximum window the kernel can be executed on
+ *
+ * @return The maximum window the kernel can be executed on.
+ */
+ const Window &window() const;
+
+protected:
+ /** Configure the kernel's window
+ *
+ * @param[in] window The maximum window which will be returned by window()
+ */
+ void configure(const Window &window);
+
+private:
+ Window _window;
+};
+}
+#endif /*__ARM_COMPUTE_IKERNEL_H__ */
diff --git a/arm_compute/core/ILut.h b/arm_compute/core/ILut.h
new file mode 100644
index 0000000000..5223aea67a
--- /dev/null
+++ b/arm_compute/core/ILut.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ILUT_H__
+#define __ARM_COMPUTE_ILUT_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Lookup Table object interface. */
+class ILut
+{
+public:
+ /** Default virtual destructor */
+ virtual ~ILut() = default;
+ /** Returns the total number of elements in the LUT.
+ *
+ * @return Total number of elements.
+ */
+ virtual size_t num_elements() const = 0;
+ /** Indicates the offset that needs to be applied to the raw index before performing a lookup in the LUT.
+ *
+ * @return The normalization offset.
+ */
+ virtual uint32_t index_offset() const = 0;
+ /** Returns the total size in bytes of the LUT.
+ *
+ * @return Total size of the LUT in bytes.
+ */
+ virtual size_t size_in_bytes() const = 0;
+ /** Returns the type of the LUT.
+ *
+ * @return The type of the LUT.
+ */
+ virtual DataType type() const = 0;
+ /** Returns a pointer to the start of the LUT.
+ * Other elements of the LUT can be accessed using buffer()[idx] for 0 <= idx < num_elements().
+ *
+ * @return Pointer to the start of the LUT.
+ */
+ virtual uint8_t *buffer() const = 0;
+ /** Clears the LUT by setting every element to zero. */
+ virtual void clear() = 0;
+};
+}
+#endif /* __ARM_COMPUTE_ILUT_H__ */
diff --git a/arm_compute/core/IMultiHOG.h b/arm_compute/core/IMultiHOG.h
new file mode 100644
index 0000000000..e91da75398
--- /dev/null
+++ b/arm_compute/core/IMultiHOG.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IMULTIHOG_H__
+#define __ARM_COMPUTE_IMULTIHOG_H__
+
+#include "arm_compute/core/IHOG.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Interface for storing multiple HOG data-objects */
+class IMultiHOG
+{
+public:
+ /** Default destructor */
+ virtual ~IMultiHOG() = default;
+ /** The number of HOG models stored
+ *
+ * @return The number of HOG models stored
+ */
+ virtual size_t num_models() const = 0;
+ /** Return a pointer to the requested HOG model
+ *
+ * @param[in] index The index of the wanted HOG model.
+ *
+ * @return A pointer to the requested HOG model
+ */
+ virtual IHOG *model(size_t index) = 0;
+ /** Return a const pointer to the requested HOG model
+ *
+ * @param[in] index The index of the wanted HOG model.
+ *
+ * @return A const pointer pointed to the HOG model
+ */
+ virtual const IHOG *model(size_t index) const = 0;
+};
+}
+
+#endif /* __ARM_COMPUTE_IMULTIHOG_H__ */
diff --git a/arm_compute/core/IMultiImage.h b/arm_compute/core/IMultiImage.h
new file mode 100644
index 0000000000..6ed3c785ca
--- /dev/null
+++ b/arm_compute/core/IMultiImage.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IMULTIIMAGE_H__
+#define __ARM_COMPUTE_IMULTIIMAGE_H__
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+class MultiImageInfo;
+
+/** Interface for multi-planar images */
+class IMultiImage
+{
+public:
+ /** Destructor */
+ virtual ~IMultiImage() = default;
+ /** Interface to be implemented by the child class to return the multi-planar image's metadata
+ *
+ * @return A pointer to the image's metadata.
+ */
+ virtual const MultiImageInfo *info() const = 0;
+ /** Return a pointer to the requested plane of the image.
+ *
+ * @param[in] index The index of the wanted plane.
+ *
+ * @return A pointer to the requested plane
+ */
+ virtual IImage *plane(unsigned int index) = 0;
+ /** Return a constant pointer to the requested plane of the image.
+ *
+ * @param[in] index The index of the wanted plane.
+ *
+ * @return A constant pointer to the requested plane
+ */
+ virtual const IImage *plane(unsigned int index) const = 0;
+};
+}
+#endif /*__ARM_COMPUTE_IMULTIIMAGE_H__ */
diff --git a/arm_compute/core/IPyramid.h b/arm_compute/core/IPyramid.h
new file mode 100644
index 0000000000..e5d7011cf9
--- /dev/null
+++ b/arm_compute/core/IPyramid.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IPYRAMID_H__
+#define __ARM_COMPUTE_IPYRAMID_H__
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Interface for pyramid data-object */
+class IPyramid
+{
+public:
+ /** Default virtual destructor */
+ virtual ~IPyramid() = default;
+ /** Interface to be implemented by the child class to return the Pyramid's metadata
+ *
+ * @return A pointer to the Pyramid's metadata.
+ */
+ virtual const PyramidInfo *info() const = 0;
+ /** Retrieves a level of the pyramid as a ITensor pointer
+ *
+ * @param[in] index The index of the level; it must be less than the number of levels.
+ *
+ * @return An ITensor pointer
+ */
+ virtual ITensor *get_pyramid_level(size_t index) const = 0;
+};
+}
+
+#endif /* __ARM_COMPUTE_IPYRAMID_H__ */
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
new file mode 100644
index 0000000000..202b50a0d8
--- /dev/null
+++ b/arm_compute/core/ITensor.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ITENSOR_H__
+#define __ARM_COMPUTE_ITENSOR_H__
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class Coordinates;
+
+/** Interface for NEON tensor */
+class ITensor
+{
+public:
+ /** Interface to be implemented by the child class to return the tensor's metadata
+ *
+ * @return A pointer to the tensor's metadata.
+ */
+ virtual ITensorInfo *info() const = 0;
+ /** Interface to be implemented by the child class to return the tensor's metadata
+ *
+ * @return A pointer to the tensor's metadata.
+ */
+ virtual ITensorInfo *info() = 0;
+ /** Default virtual destructor */
+ virtual ~ITensor() = default;
+ /** Interface to be implemented by the child class to return a pointer to CPU memory
+ *
+ * @return A CPU pointer to the beginning of the image's allocation.
+ */
+ virtual uint8_t *buffer() const = 0;
+
+ /** Return a pointer to the element at the passed coordinates
+ *
+ * @param[in] id Coordinates of the element
+ *
+ * @return Pointer to the requested element
+ */
+ inline uint8_t *ptr_to_element(const Coordinates &id) const
+ {
+ return buffer() + info()->offset_element_in_bytes(id);
+ }
+
+ /** Copy the content of another tensor.
+ *
+ * @note The number of dimensions of the source tensor must be less than or equal to those of the destination tensor.
+ *
+ * @note All dimensions of the destination tensor must be greater than or equal to those of the source tensor.
+ *
+ * @note num_channels() and element_size() of both tensors must match.
+ *
+ * @param[in] src Source tensor to copy from.
+ */
+ void copy_from(const ITensor &src);
+
+ /** Print a tensor to a given stream using user defined formatting information
+ *
+ * @param s Output stream
+ * @param io_fmt Format information
+ */
+ void print(std::ostream &s, IOFormatInfo io_fmt = IOFormatInfo()) const;
+};
+
+using IImage = ITensor;
+}
+#endif /*__ARM_COMPUTE_ITENSOR_H__ */
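As a sketch of how `ptr_to_element()` composes `buffer()` with the metadata, the helper below reads the element at (x, y) of a U8 tensor. It assumes one byte per element and is illustrative only, not part of the patch:

```cpp
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/ITensor.h"

#include <cstdint>

// Illustrative only: read the U8 element at (x, y).
// ptr_to_element() resolves to buffer() + info()->offset_element_in_bytes(id).
uint8_t read_u8(const arm_compute::ITensor &tensor, int x, int y)
{
    return *tensor.ptr_to_element(arm_compute::Coordinates(x, y));
}
```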
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
new file mode 100644
index 0000000000..bb3ac6e35e
--- /dev/null
+++ b/arm_compute/core/ITensorInfo.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ITENSORINFO_H__
+#define __ARM_COMPUTE_ITENSORINFO_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the tensor's metadata */
+class ITensorInfo
+{
+public:
+ /** Default virtual destructor */
+ virtual ~ITensorInfo() = default;
+ /** Set the data type to the specified value.
+ *
+ * @warning This resets the format to UNKNOWN.
+ *
+ * @param[in] data_type The new data type.
+ */
+ virtual void set_data_type(DataType data_type) = 0;
+ /** Set the number of channels to the specified value.
+ *
+ * @warning This resets the format to UNKNOWN.
+ *
+ * @param[in] num_channels New number of channels.
+ */
+ virtual void set_num_channels(int num_channels) = 0;
+ /** Set the format of an already initialized tensor.
+ *
+ * @note If the data type has already been configured (i.e. it is not UNKNOWN) it
+ * must match the new format. If the data type hasn't been configured, it will
+ * be derived from the format.
+ *
+ * @param[in] format Single-plane format of the tensor.
+ */
+ virtual void set_format(Format format) = 0;
+ /** Set the shape of an already initialized tensor.
+ *
+ * @warning Changing the shape requires recomputing the strides and is
+ * therefore only possible if the tensor hasn't been allocated yet.
+ *
+ * @param[in] shape New tensor shape.
+ */
+ virtual void set_tensor_shape(TensorShape shape) = 0;
+ /** Set the fixed point position to the specified value
+ *
+ * @warning The fixed point position must only be set after the data type has been configured
+ *
+ * @param[in] fixed_point_position The new fixed point position
+ */
+ virtual void set_fixed_point_position(int fixed_point_position) = 0;
+ /** Update the offset to the first element and the strides to automatically computed values.
+ *
+ * @note The padding used by this method is deliberately conservative so that the tensor can be used for most functions.
+ *
+ * @return True if the strides or the offset to the first element have changed.
+ */
+ virtual bool auto_padding() = 0;
+ /** Update the offset to the first element, the strides and the total size.
+ *
+ * @note This function can only increase the offset, strides and total size.
+ *
+ * @param[in] padding Padding around the XY plane in number of elements.
+ *
+ * @return True if the strides, offset and total size have changed.
+ */
+ virtual bool extend_padding(const PaddingSize &padding) = 0;
+ /** Return the size of the requested dimension
+ *
+ * @param[in] index Index of the dimension
+ *
+ * @return Size of the requested dimension
+ */
+ virtual size_t dimension(size_t index) const = 0;
+ /** The strides in bytes for accessing each dimension of the tensor
+ *
+ * @return Strides in bytes for each tensor dimension
+ */
+ virtual const Strides &strides_in_bytes() const = 0;
+ /** The offset from the beginning of the memory allocation to the first element of the tensor.
+ * This can be used to efficiently access elements in a 2D tensor
+ *
+ * @return The offset in bytes to access the first element of the tensor.
+ */
+ virtual size_t offset_first_element_in_bytes() const = 0;
+ /** The offset in bytes from the beginning of the memory allocation to access the element at position (x, y, z ...)
+ *
+ * @param[in] pos Vector with the coordinates of the element to access.
+ * The size of this vector must be equal to the number of dimensions of the tensor
+ *
+ * @return Offset in bytes from the beginning of the memory allocation to access the element (x, y, z, ...)
+ */
+ virtual size_t offset_element_in_bytes(const Coordinates &pos) const = 0;
+ /** Fixed point position used when the tensor data type is QS8 or QS16
+ *
+ * @return The fixed point position that expresses the number of bits for the fractional part of the number
+ */
+ virtual int fixed_point_position() const = 0;
+ /** Element size in bytes calculated as data_size() * num_channels()
+ *
+ * @return The size of one element in bytes
+ */
+ virtual size_t element_size() const = 0;
+ /** The number of dimensions of the tensor (rank)
+ *
+ * @return The number of dimensions of the tensor (rank)
+ */
+ virtual size_t num_dimensions() const = 0;
+ /** The number of channels for each tensor element
+ *
+ * @return The number of channels for each tensor element
+ */
+ virtual size_t num_channels() const = 0;
+ /** Size for each dimension of the tensor
+ *
+ * @return A vector with the size for each dimension of the tensor
+ */
+ virtual const TensorShape &tensor_shape() const = 0;
+ /** Data type used for each element of the tensor
+ *
+ * @return Tensor data type
+ */
+ virtual DataType data_type() const = 0;
+ /** Colour format of the image
+ *
+ * @return Colour format of the image
+ */
+ virtual Format format() const = 0;
+ /** Returns the total size of the tensor in bytes.
+ *
+ * @return Total size of the tensor in bytes.
+ */
+ virtual size_t total_size() const = 0;
+ /** Padding of tensor.
+ *
+ * @return Padding.
+ */
+ virtual PaddingSize padding() const = 0;
+ /** Checks if the tensor has been allocated with padding.
+ *
+ * @return True if padding is allocated in the tensor, otherwise false.
+ */
+ virtual bool has_padding() const = 0;
+ /** Flag indicating whether the size of the tensor can be changed.
+ *
+ * @return True if the tensor size can be changed.
+ */
+ virtual bool is_resizable() const = 0;
+ /** Set the flag whether the tensor size can be changed.
+ *
+ * @param[in] is_resizable Flag indicating whether the tensor size can be changed.
+ */
+ virtual void set_is_resizable(bool is_resizable) = 0;
+ /** Valid region of the tensor. All elements in the valid region have defined values.
+ *
+ * @return The valid region.
+ */
+ virtual ValidRegion valid_region() const = 0;
+ /** Set the valid region of the tensor.
+ *
+ * @param[in] valid_region Valid region to set.
+ */
+ virtual void set_valid_region(ValidRegion valid_region) = 0;
+};
+}
+#endif /*__ARM_COMPUTE_ITENSORINFO_H__ */
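For a row-major 2D tensor, `offset_element_in_bytes()` amounts to accumulating coordinate times stride per dimension on top of `offset_first_element_in_bytes()`. The sketch below spells that arithmetic out; it assumes at least two dimensions and is illustrative only:

```cpp
#include "arm_compute/core/ITensorInfo.h"

#include <cstddef>

// Illustrative only: byte offset of element (x, y) computed by hand from the
// strides; for a 2D tensor this matches info.offset_element_in_bytes({x, y}).
size_t offset_2d(const arm_compute::ITensorInfo &info, size_t x, size_t y)
{
    const arm_compute::Strides &strides = info.strides_in_bytes();
    return info.offset_first_element_in_bytes() + x * strides[0] + y * strides[1];
}
```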
diff --git a/arm_compute/core/MultiImageInfo.h b/arm_compute/core/MultiImageInfo.h
new file mode 100644
index 0000000000..6d76953845
--- /dev/null
+++ b/arm_compute/core/MultiImageInfo.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MULTIIMAGEINFO_H__
+#define __ARM_COMPUTE_MULTIIMAGEINFO_H__
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** Store the multi-planar image's metadata */
+class MultiImageInfo
+{
+public:
+ /** Constructor */
+ MultiImageInfo();
+ /** Initialize the metadata structure with the given parameters
+ *
+ * @param[in] width Width of the image (in number of pixels)
+ * @param[in] height Height of the image (in number of pixels)
+ * @param[in] format Colour format of the image.
+ */
+ void init(unsigned int width, unsigned int height, Format format);
+ /** Colour format of the image
+ *
+ * @return Colour format of the image
+ */
+ Format format() const;
+ /** Width in pixels
+ *
+ * @return The width in pixels
+ */
+ unsigned int width() const;
+ /** Height in pixels
+ *
+ * @return The height in pixels
+ */
+ unsigned int height() const;
+
+protected:
+ unsigned int _width;
+ unsigned int _height;
+ Format _format;
+};
+}
+#endif /*__ARM_COMPUTE_MULTIIMAGEINFO_H__ */
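A minimal usage sketch, assuming `Format::NV12` is among the formats defined in Types.h (illustrative only):

```cpp
#include "arm_compute/core/MultiImageInfo.h"

// Illustrative only: describe a 640x480 NV12 multi-planar image.
arm_compute::MultiImageInfo make_nv12_info()
{
    arm_compute::MultiImageInfo info;
    info.init(640, 480, arm_compute::Format::NV12);
    return info;
}
```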
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/NEON/INEKernel.h
new file mode 100644
index 0000000000..3ac8164a51
--- /dev/null
+++ b/arm_compute/core/NEON/INEKernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_INEKERNEL_H__
+#define __ARM_COMPUTE_INEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+
+namespace arm_compute
+{
+using INEKernel = ICPPKernel;
+}
+#endif /*__ARM_COMPUTE_INEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
new file mode 100644
index 0000000000..ca25532ef1
--- /dev/null
+++ b/arm_compute/core/NEON/INESimpleKernel.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_INESIMPLEKERNEL_H__
+#define __ARM_COMPUTE_INESIMPLEKERNEL_H__
+
+#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
+
+namespace arm_compute
+{
+using INESimpleKernel = ICPPSimpleKernel;
+}
+#endif /*__ARM_COMPUTE_INESIMPLEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
new file mode 100644
index 0000000000..9be7c8a658
--- /dev/null
+++ b/arm_compute/core/NEON/NEColorConvertHelper.inl
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/Utils.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+constexpr float red_coef_bt709 = 1.5748f;
+constexpr float green_coef_bt709 = -0.1873f;
+constexpr float green_coef2_bt709 = -0.4681f;
+constexpr float blue_coef_bt709 = 1.8556f;
+
+constexpr float rgb2yuv_bt709_kr = 0.2126f;
+constexpr float rgb2yuv_bt709_kb = 0.0722f;
+// K_g = 1 - K_r - K_b
+constexpr float rgb2yuv_bt709_kg = 0.7152f;
+// C_u = 1 / (2 * (1 - K_b))
+constexpr float rgb2yuv_bt709_cu = 0.5389f;
+// C_v = 1 / (2 * (1 - K_r))
+constexpr float rgb2yuv_bt709_cv = 0.6350f;
+
+inline void convert_uint8x16_to_float32x4x4(const uint8x16_t &in, float32x4x4_t &out)
+{
+ const auto tmp1 = vmovl_u8(vget_low_u8(in));
+ out.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
+ out.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
+ const auto tmp2 = vmovl_u8(vget_high_u8(in));
+ out.val[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
+ out.val[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
+}
+
+inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
+{
+ out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
+ vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+ out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
+ vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+ out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
+ vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+}
+
+inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
+{
+ const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
+ vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+ const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
+ vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+ out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+}
+
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
+ float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+{
+ /*
+ Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
+ U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
+ V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
+ */
+ const auto c128 = vdupq_n_f32(128.f);
+
+ // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b
+ yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
+ yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
+ yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
+
+ // U = (B - Y) / (2 * (1 - K_b))
+ uvec = vsubq_f32(bvec, yvec);
+ uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
+
+ // V = (R - Y) / (2 * (1 - K_r))
+ vvec = vsubq_f32(rvec, yvec);
+ vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
+}
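+
+// Worked example (illustrative only) for rgb_to_yuv_calculation above: a grey pixel
+// R = G = B = 255 gives Y = 255 * (0.2126 + 0.7152 + 0.0722) = 255,
+// U = (255 - 255) * 0.5389 + 128 = 128 and V = (255 - 255) * 0.6350 + 128 = 128,
+// i.e. full luma and neutral chroma, as expected for an achromatic input.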
+
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
+ float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+{
+ float32x4x3_t rgb1, rgb2;
+
+ // Compute: cb - 128 and cr - 128;
+ const auto c128 = vdupq_n_f32(128.f);
+ uvec_val = vsubq_f32(uvec_val, c128);
+ vvec_val = vsubq_f32(vvec_val, c128);
+
+ // Compute:
+ // r = 0.0000f*f_u + 1.5748f*f_v;
+ // g = -0.1873f*f_u - 0.4681f*f_v;
+ // b = 1.8556f*f_u + 0.0000f*f_v;
+ const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
+ const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
+ const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
+ vmulq_n_f32(vvec_val, green_coef2_bt709));
+
+ // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
+ // The result is stored in two float32x4x3_t which are then converted to one uint8x8x3_t
+ // and written back to memory using the vst3 instruction.
+
+ rgb1.val[0] = vaddq_f32(yvec_val, red);
+ rgb1.val[1] = vaddq_f32(yvec_val, green);
+ rgb1.val[2] = vaddq_f32(yvec_val, blue);
+
+ rgb2.val[0] = vaddq_f32(yyvec_val, red);
+ rgb2.val[1] = vaddq_f32(yyvec_val, green);
+ rgb2.val[2] = vaddq_f32(yyvec_val, blue);
+
+ uint8x8x3_t u8_rgb;
+ convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
+
+ if(!alpha)
+ {
+ vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
+ vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
+ vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
+ vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
+ vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
+ vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
+ vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
+ vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
+ }
+ else
+ {
+ uint8x8x4_t u8_rgba;
+ u8_rgba.val[0] = u8_rgb.val[0];
+ u8_rgba.val[1] = u8_rgb.val[1];
+ u8_rgba.val[2] = u8_rgb.val[2];
+ u8_rgba.val[3] = vdup_n_u8(255);
+ vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
+ vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
+ vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
+ vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
+ vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
+ vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
+ vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
+ vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
+ }
+}
+
+inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
+{
+ uint8x16x3_t rgb;
+
+ if(alpha)
+ {
+ const auto tmp = vld4q_u8(ptr);
+ rgb.val[0] = tmp.val[0];
+ rgb.val[1] = tmp.val[1];
+ rgb.val[2] = tmp.val[2];
+ }
+ else
+ {
+ rgb = vld3q_u8(ptr);
+ }
+
+ return rgb;
+}
+
+inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
+{
+ // Convert the uint8x16_t to float32x4x4_t
+ float32x4x4_t frvec_top, fgvec_top, fbvec_top;
+ convert_uint8x16_to_float32x4x4(vec_top.val[0], frvec_top);
+ convert_uint8x16_to_float32x4x4(vec_top.val[1], fgvec_top);
+ convert_uint8x16_to_float32x4x4(vec_top.val[2], fbvec_top);
+
+ float32x4x4_t frvec_bottom, fgvec_bottom, fbvec_bottom;
+ convert_uint8x16_to_float32x4x4(vec_bottom.val[0], frvec_bottom);
+ convert_uint8x16_to_float32x4x4(vec_bottom.val[1], fgvec_bottom);
+ convert_uint8x16_to_float32x4x4(vec_bottom.val[2], fbvec_bottom);
+
+ float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
+ float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
+
+ for(auto i = 0; i < 4; ++i)
+ {
+ rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
+ fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
+ rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
+ fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+ }
+
+ convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
+ convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]);
+ convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]);
+ convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]);
+ convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]);
+ convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
+}
+
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+ unsigned char *const __restrict out_uv)
+{
+ uint8x16x3_t vec_top, vec_bottom;
+ vec_top.val[0] = rvec_top;
+ vec_top.val[1] = gvec_top;
+ vec_top.val[2] = bvec_top;
+ vec_bottom.val[0] = rvec_bottom;
+ vec_bottom.val[1] = gvec_bottom;
+ vec_bottom.val[2] = bvec_bottom;
+
+ rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+ vst1q_u8(out_y_top, vec_top.val[0]);
+ vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+ const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
+ const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
+ const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
+ const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
+
+ uint8x8x2_t uvvec;
+ uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
+ uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
+
+ vst2_u8(out_uv, uvvec);
+}
+
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+ unsigned char *const __restrict out_u,
+ unsigned char *const __restrict out_v)
+{
+ uint8x16x3_t vec_top, vec_bottom;
+ vec_top.val[0] = rvec_top;
+ vec_top.val[1] = gvec_top;
+ vec_top.val[2] = bvec_top;
+ vec_bottom.val[0] = rvec_bottom;
+ vec_bottom.val[1] = gvec_bottom;
+ vec_bottom.val[2] = bvec_bottom;
+
+ rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+ vst1q_u8(out_y_top, vec_top.val[0]);
+ vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+ const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
+ const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
+ const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
+ vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+
+ vst1_u8(out_u, vget_low_u8(uvvec));
+ vst1_u8(out_v, vget_high_u8(uvvec));
+}
+
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+ unsigned char *const __restrict out_y,
+ unsigned char *const __restrict out_u,
+ unsigned char *const __restrict out_v)
+{
+ // Convert the uint8x16_t to float32x4x4_t
+ float32x4x4_t frvec, fgvec, fbvec;
+ convert_uint8x16_to_float32x4x4(rvec, frvec);
+ convert_uint8x16_to_float32x4x4(gvec, fgvec);
+ convert_uint8x16_to_float32x4x4(bvec, fbvec);
+
+ float32x4x4_t fyvec, fuvec, fvvec;
+ for(auto i = 0; i < 4; ++i)
+ {
+ rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
+ fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+ }
+
+ uint8x16_t yvec, uvec, vvec;
+ convert_float32x4x4_to_uint8x16(fyvec, yvec);
+ convert_float32x4x4_to_uint8x16(fuvec, uvec);
+ convert_float32x4x4_to_uint8x16(fvvec, vvec);
+
+ vst1q_u8(out_y, yvec);
+ vst1q_u8(out_u, uvec);
+ vst1q_u8(out_v, vvec);
+}
+}
+
+namespace arm_compute
+{
+void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+ Iterator in(input_ptr, win);
+ Iterator out(output_ptr, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16x4_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ ta2.val[3] = vdupq_n_u8(255);
+ vst4q_u8(out.ptr(), ta2);
+ },
+ in, out);
+}
+
+void colorconvert_rgbx_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+ Iterator in(input_ptr, win);
+ Iterator out(output_ptr, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta1 = vld4q_u8(in.ptr());
+ uint8x16x3_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ vst3q_u8(out.ptr(), ta2);
+ },
+ in, out);
+}
+
+template <bool yuyv, bool alpha>
+void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+ constexpr auto element_size = alpha ? 32 : 24;
+ constexpr auto shift = yuyv ? 0 : 1;
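+ // Each yuyv_to_rgb_calculation() call below writes 8 pixels, so element_size is the
+ // byte count of one 8-pixel group: 8 * 4 = 32 bytes for RGBA, 8 * 3 = 24 bytes for RGB.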
+
+ Iterator in(input_ptr, win);
+ Iterator out(output_ptr, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ float32x4x4_t uvec, yvec, vvec, yyvec;
+ const auto ta = vld4q_u8(in.ptr());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+ //ta.val[3] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ convert_uint8x16_to_float32x4x4(ta.val[0 + shift], yvec);
+ convert_uint8x16_to_float32x4x4(ta.val[1 - shift], uvec);
+ convert_uint8x16_to_float32x4x4(ta.val[2 + shift], yyvec);
+ convert_uint8x16_to_float32x4x4(ta.val[3 - shift], vvec);
+
+ yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+ },
+ in, out);
+}
+
+template <bool uv, bool alpha>
+void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
+ const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+ constexpr auto element_size = alpha ? 32 : 24;
+ const auto out_stride = output_ptr->info()->strides_in_bytes().y();
+ constexpr auto shift = uv ? 0 : 1;
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
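+ // For example, an XY window covering x in [0, 32) with step 32 and y in [0, 2) maps to
+ // a UV window covering x in [0, 16) with step 16 and y in [0, 1) with step 1, matching
+ // NV12's 2x2 chroma subsampling.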
+
+ Iterator in_y(input_ptr->plane(0), win);
+ Iterator in_uv(input_ptr->plane(1), win_uv);
+ Iterator out(output_ptr, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
+ convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
+ convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
+ convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
+ convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
+ convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift], uvec);
+ convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift], vvec);
+
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_uv, out);
+}
+
+template <bool alpha>
+void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
+ const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+ constexpr auto element_size = alpha ? 32 : 24;
+ const auto out_stride = output_ptr->info()->strides_in_bytes().y();
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in_y(input_ptr->plane(0), win);
+ Iterator in_u(input_ptr->plane(1), win_uv);
+ Iterator in_v(input_ptr->plane(2), win_uv);
+ Iterator out(output_ptr, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_u = vld1q_u8(in_u.ptr());
+ const auto ta_v = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u.val[0] = U0 U2 U4 U6 ...
+ //ta_v.val[0] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec;
+ convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top);
+ convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top);
+ convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom);
+ convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom);
+ convert_uint8x16_to_float32x4x4(ta_u, uvec);
+ convert_uint8x16_to_float32x4x4(ta_v, vvec);
+
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_u, in_v, out);
+}
+
+template <bool yuyv>
+void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ constexpr auto shift = yuyv ? 0 : 1;
+
+ // NV12's UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in(input_ptr, win);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_uv(output_ptr->plane(1), win_uv);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+ //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16x2_t uvvec;
+ uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst2q_u8(out_uv.ptr(), uvvec);
+ },
+ in, out_y, out_uv);
+}
+
+void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in_y(input_ptr->plane(0), win);
+ Iterator in_u(input_ptr->plane(1), win_uv);
+ Iterator in_v(input_ptr->plane(2), win_uv);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_uv(output_ptr->plane(1), win_uv);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ uint8x16x2_t ta_uv;
+ ta_uv.val[0] = vld1q_u8(in_u.ptr());
+ ta_uv.val[1] = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst2q_u8(out_uv.ptr(), ta_uv);
+ },
+ in_y, in_u, in_v, out_y, out_uv);
+}
+
+template <bool uv>
+void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ constexpr auto shift = uv ? 0 : 1;
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in_y(input_ptr->plane(0), win);
+ Iterator in_uv(input_ptr->plane(1), win_uv);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_u(output_ptr->plane(1), win_uv);
+ Iterator out_v(output_ptr->plane(2), win_uv);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+ vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
+}
+
+template <bool yuyv>
+void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ constexpr auto shift = yuyv ? 0 : 1;
+
+ // Destination's UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in(input_ptr, win);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_u(output_ptr->plane(1), win_uv);
+ Iterator out_v(output_ptr->plane(2), win_uv);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+ //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16_t uvec;
+ uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ vst1q_u8(out_u.ptr(), uvec);
+
+ uint8x16_t vvec;
+ vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst1q_u8(out_v.ptr(), vvec);
+ },
+ in, out_y, out_u, out_v);
+}
+
+template <bool uv>
+void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ constexpr auto shift = uv ? 0 : 1;
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in_y(input_ptr->plane(0), win);
+ Iterator in_uv(input_ptr->plane(1), win_uv);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_u(output_ptr->plane(1), win);
+ Iterator out_v(output_ptr->plane(2), win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_uv.val[0 + shift];
+ uvec.val[1] = ta_uv.val[0 + shift];
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_uv.val[1 - shift];
+ vvec.val[1] = ta_uv.val[1 - shift];
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
+}
+
+void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in_y(input_ptr->plane(0), win);
+ Iterator in_u(input_ptr->plane(1), win_uv);
+ Iterator in_v(input_ptr->plane(2), win_uv);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_u(output_ptr->plane(1), win);
+ Iterator out_v(output_ptr->plane(2), win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_u = vld1q_u8(in_u.ptr());
+ const auto ta_v = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u = U0 U2 U4 U6 ...
+ //ta_v = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_u;
+ uvec.val[1] = ta_u;
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_v;
+ vvec.val[1] = ta_v;
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_u, in_v, out_y, out_u, out_v);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in(input_ptr, win);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_uv(output_ptr->plane(1), win_uv);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+ ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+ out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+ out_uv.ptr());
+ },
+ in, out_y, out_uv);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ // UV's width and height are subsampled
+ Window win_uv(win);
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ win_uv.validate();
+
+ Iterator in(input_ptr, win);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_u(output_ptr->plane(1), win_uv);
+ Iterator out_v(output_ptr->plane(2), win_uv);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
+ ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
+ out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
+ out_u.ptr(), out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
+}
+
+template <bool alpha>
+void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ win.validate();
+
+ const auto input_ptr = static_cast<const IImage *__restrict>(input);
+ const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
+
+ Iterator in(input_ptr, win);
+ Iterator out_y(output_ptr->plane(0), win);
+ Iterator out_u(output_ptr->plane(1), win);
+ Iterator out_v(output_ptr->plane(2), win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto ta_rgb = load_rgb(in.ptr(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
+ out_y.ptr(), out_u.ptr(), out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
+}
+}
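All of the NEON kernels above reduce to the same per-pixel BT.709 full-range arithmetic. A scalar reference using the same constants, handy for cross-checking the vector paths, might look like the sketch below (not part of the patch; no clamping applied):

```cpp
// Illustrative scalar reference for the BT.709 RGB -> YUV maths used by
// rgb_to_yuv_calculation(); results are not clamped to [0, 255] here.
void rgb_to_yuv_scalar(float r, float g, float b, float &y, float &u, float &v)
{
    constexpr float kr = 0.2126f; // rgb2yuv_bt709_kr
    constexpr float kb = 0.0722f; // rgb2yuv_bt709_kb
    constexpr float kg = 1.0f - kr - kb;

    y = kr * r + kg * g + kb * b;
    u = (b - y) / (2.0f * (1.0f - kb)) + 128.0f; // cu = 0.5389
    v = (r - y) / (2.0f * (1.0f - kr)) + 128.0f; // cv = 0.6350
}
```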
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
new file mode 100644
index 0000000000..fb712611cb
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__
+#define __ARM_COMPUTE_NEFIXEDPOINT_H__
+
+#include "arm_compute/core/FixedPoint.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qint8x8_t = int8x8_t; /**< 8 bit fixed point vector with 8 elements */
+using qint8x8x2_t = int8x8x2_t; /**< 8 bit fixed point vector with 16 elements */
+using qint8x8x3_t = int8x8x3_t; /**< 8 bit fixed point vector with 24 elements */
+using qint8x8x4_t = int8x8x4_t; /**< 8 bit fixed point vector with 32 elements */
+using qint8x16_t = int8x16_t; /**< 8 bit fixed point vector with 16 elements */
+using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */
+using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */
+using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */
+using qint16x4_t = int16x4_t; /**< 16 bit fixed point vector with 4 elements */
+using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */
+using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */
+using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8_t = int16x8_t; /**< 16 bit fixed point vector with 8 elements */
+using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
+using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
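+
+// A qintN value with fixed point position p represents raw_value / 2^p.
+// For example (illustrative only): with p = 3, the qint8_t raw value 24 represents
+// 24 / 2^3 = 3.0, and the representable range is [-16.0, 15.875].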
+
+/** Get the lower half of a 16-element vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_low_qs8(qint8x16_t a);
+
+/** Get the upper half of a 16-element vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_high_qs8(qint8x16_t a);
+
+/** Load a single 8 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_qs8(const qint8_t *addr);
+
+/** Load a single 8 bit fixed point vector from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_qs8(const qint8_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (4 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vld1_qs16(const qint16_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (8 elements)
+ */
+qint16x8_t vld1q_qs16(const qint16_t *addr);
+
+/** Load all lanes of an 8 bit fixed point vector with the same value from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_dup_qs8(const qint8_t *addr);
+
+/** Load all lanes of an 8 bit fixed point vector with the same value from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
+
+/** Store a single 8 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b 8 bit fixed point vector to store
+ *
+ */
+void vst1_qs8(qint8_t *addr, qint8x8_t b);
+
+/** Store a single 8 bit fixed point vector to memory (16 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b 8 bit fixed point vector to store
+ *
+ */
+void vst1q_qs8(qint8_t *addr, qint8x16_t b);
+
+/** Store a single 16 bit fixed point vector to memory (4 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b 16 bit fixed point vector to store
+ *
+ */
+void vst1_qs16(qint16_t *addr, qint16x4_t b);
+
+/** Store a single 16 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b 16 bit fixed point vector to store
+ *
+ */
+void vst1q_qs16(qint16_t *addr, qint16x8_t b);
+
+/** 16 bit fixed point vector saturating narrow (8 elements)
+ *
+ * @param[in] a 16 bit fixed point vector to convert
+ *
+ * @return 8 bit fixed point vector
+ */
+qint8x8_t vqmovn_qs16(qint16x8_t a);
+
+/** 8 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x8_t vdup_n_qs8(qint8_t a);
+
+/** 8 bit fixed point vector duplicate (16 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8(qint8_t a);
+
+/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a Floating point value to duplicate and convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
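+// For example (illustrative only): vdupq_n_qs8_f32(0.5f, 6) duplicates the raw
+// value 0.5f * 2^6 = 32 across all 16 lanes.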
+
+/** 16 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 16 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16(qint16_t a);
+
+/** Absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vabs_qs8(qint8x8_t a);
+
+/** Absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vabsq_qs8(qint8x16_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vqabs_qs8(qint8x8_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vqabsq_qs8(qint8x16_t a);
+
+/** 8 bit fixed point vector max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector max (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise max operation
+ */
+qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise min operation
+ */
+qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 16 bit fixed point vector saturating add (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
+
+/** 16 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b);
+
+/** 8 bit fixed point vector pairwise add long (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The 16 bit vector containing the pairwise sums of adjacent input elements
+ */
+int16x4_t vpaddl_qs8(qint8x8_t a);
+
+/** 8 bit fixed point vector subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector multiply (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
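+// Illustrative note (not in the original docs): in a Qx.m format with
+// fixed_point_position = m, this multiply is equivalent per lane to
+// (int8_t)((int16_t(a) * b + (1 << (m - 1))) >> m). For example, with m = 4
+// (Q3.4), 1.5 * 2.0 is (24 * 32 + 8) >> 4 = 48, i.e. 3.0.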
+
+/** 8 bit fixed point vector multiply (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector long multiply (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector long multiplication.
+ */
+qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation computes the product of @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] c Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation computes the product of @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] c Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation computes the product of @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] c Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation computes the product of @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] c Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate long (8 elements).
+ * This operation computes the product of @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] c Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector.
+ * This operation computes the product of @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b Second 8 bit fixed point input vector
+ * @param[in] c Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** Convert a float vector with 4x2 elements to an 8 bit fixed point vector with 8 elements
+ *
+ * @param[in] a Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position);
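+// Illustrative note: per lane this computes an approximation of
+// round(a * (1 << fixed_point_position)) saturated to the int8 range, so
+// e.g. 0.75f with fixed_point_position = 5 maps to 24 (0x18).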
+
+/** Convert a float vector with 4x4 elements to an 8 bit fixed point vector with 16 elements
+ *
+ * @param[in] a Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
+ *
+ * @param[in] a 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x2
+ */
+float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements
+ *
+ * @param[in] a 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x4
+ */
+float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Division fixed point 8bit (8 elements)
+ *
+ * @param[in] a First 8bit fixed point input vector
+ * @param[in] b Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8bit fixed point format.
+ */
+qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
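+// Illustrative note: as implemented in NEFixedPoint.inl, the division is a
+// multiplication by the Newton-Raphson reciprocal, i.e.
+// vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position).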
+
+/** Division fixed point 8bit (16 elements)
+ *
+ * @param[in] a First 8bit fixed point input vector
+ * @param[in] b Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8bit fixed point format.
+ */
+qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (8 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (16 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (8 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (16 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (8 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (16 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (8 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (8 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate inverse square root for fixed point 8bit using Newton-Raphson method (16 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphson method (16 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating power pow(a, b) for fixed point 8bit (16 elements).
+ *
+ * pow(a,b) = e^(b*log(a))
+ *
+ * @param[in] a 8bit fixed point input vector
+ * @param[in] b 8bit fixed point power vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit power.
+ */
+qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+}
+#include "arm_compute/core/NEON/NEFixedPoint.inl"
+#endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
new file mode 100644
index 0000000000..6db344dc11
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements */
+const std::array<qint8x8_t, 4> exp_tab_qs8 =
+{
+ {
+ vdup_n_s8(0x7F), // 0.9978546
+ vdup_n_s8(0x3F), // 0.4994721
+ vdup_n_s8(0x16), // 0.1763723
+ vdup_n_s8(0x05), // 0.0435108
+ }
+};
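+// Illustrative note: each coefficient c is encoded as trunc(c * (1 << 7)) in
+// Q0.7, e.g. 0.4994721 * 128 = 63.9 is stored as 0x3F (63), while 0.9978546
+// saturates to 0x7F (127).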
+
+/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements */
+const std::array<qint8x16_t, 4> exp_tabq_qs8 =
+{
+ {
+ vdupq_n_s8(0x7F), // 0.9978546
+ vdupq_n_s8(0x3F), // 0.4994721
+ vdupq_n_s8(0x16), // 0.1763723
+ vdupq_n_s8(0x05), // 0.0435108
+ }
+};
+
+/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x8_t, 4> log_tab_qs8 =
+{
+ {
+ vdup_n_s8(0x5C), // 1.4384189
+ vdup_n_s8(-0x56), // -0.6771900
+ vdup_n_s8(0x29), // 0.3218538
+ vdup_n_s8(-0x0A), // -0.0832229
+ }
+};
+
+/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x16_t, 4> log_tabq_qs8 =
+{
+ {
+ vdupq_n_s8(0x5C), // 1.4384189
+ vdupq_n_s8(-0x56), // -0.6771900
+ vdupq_n_s8(0x29), // 0.3218538
+ vdupq_n_s8(-0x0A), // -0.0832229
+ }
+};
+
+inline qint8x8_t vget_low_qs8(qint8x16_t a)
+{
+ return vget_low_s8(a);
+}
+
+inline qint8x8_t vget_high_qs8(qint8x16_t a)
+{
+ return vget_high_s8(a);
+}
+
+inline qint8x8_t vld1_qs8(const qint8_t *addr)
+{
+ return vld1_s8(addr);
+}
+
+inline qint8x16_t vld1q_qs8(const qint8_t *addr)
+{
+ return vld1q_s8(addr);
+}
+
+inline qint16x4_t vld1_qs16(const qint16_t *addr)
+{
+ return vld1_s16(addr);
+}
+
+inline qint16x8_t vld1q_qs16(const qint16_t *addr)
+{
+ return vld1q_s16(addr);
+}
+
+inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
+{
+ return vld1_dup_s8(addr);
+}
+
+inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
+{
+ return vld1q_dup_s8(addr);
+}
+
+inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
+{
+ vst1_s8(addr, b);
+}
+
+inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
+{
+ vst1q_s8(addr, b);
+}
+
+inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
+{
+ vst1_s16(addr, b);
+}
+
+inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
+{
+ vst1q_s16(addr, b);
+}
+
+inline qint8x8_t vqmovn_qs16(qint16x8_t a)
+{
+ return vqmovn_s16(a);
+}
+
+inline qint8x8_t vdup_n_qs8(qint8_t a)
+{
+ return vdup_n_s8(a);
+}
+
+inline qint8x16_t vdupq_n_qs8(qint8_t a)
+{
+ return vdupq_n_s8(a);
+}
+
+inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
+{
+ float32x4x4_t res =
+ {
+ {
+ vdupq_n_f32(a),
+ vdupq_n_f32(a),
+ vdupq_n_f32(a),
+ vdupq_n_f32(a),
+ }
+ };
+ return vcvtq_qs8_f32(res, fixed_point_position);
+}
+
+inline qint16x8_t vdupq_n_qs16(qint16_t a)
+{
+ return vdupq_n_s16(a);
+}
+
+inline qint8x8_t vabs_qs8(qint8x8_t a)
+{
+ return vabs_s8(a);
+}
+
+inline qint8x16_t vabsq_qs8(qint8x16_t a)
+{
+ return vabsq_s8(a);
+}
+
+inline qint8x8_t vqabs_qs8(qint8x8_t a)
+{
+ return vqabs_s8(a);
+}
+
+inline qint8x16_t vqabsq_qs8(qint8x16_t a)
+{
+ return vqabsq_s8(a);
+}
+
+inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vmax_s8(a, b);
+}
+
+inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
+{
+ return vmaxq_s8(a, b);
+}
+
+inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vpmax_s8(a, b);
+}
+
+inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vmin_s8(a, b);
+}
+
+inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
+{
+ return vminq_s8(a, b);
+}
+
+inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vpmin_s8(a, b);
+}
+
+inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vadd_s8(a, b);
+}
+
+inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
+{
+ return vaddq_s8(a, b);
+}
+
+inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vqadd_s8(a, b);
+}
+
+inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
+{
+ return vqaddq_s8(a, b);
+}
+
+inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
+{
+ return vqadd_s16(a, b);
+}
+
+inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
+{
+ return vqaddq_s16(a, b);
+}
+
+inline int16x4_t vpaddl_qs8(qint8x8_t a)
+{
+ return vpaddl_s8(a);
+}
+
+inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vsub_s8(a, b);
+}
+
+inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
+{
+ return vsubq_s8(a, b);
+}
+
+inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
+{
+ return vqsub_s8(a, b);
+}
+
+inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
+{
+ return vqsubq_s8(a, b);
+}
+
+inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary result with a constant used to round up the result
+ qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+ // Vector multiply-accumulate long
+ res = vmlal_s8(res, a, b);
+
+ // Shift right by fixed_point_position
+ res = vshlq_s16(res, fixed_point_position_s16);
+
+ // Convert back to qint8
+ return vmovn_s16(res);
+}
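+// Worked example (illustrative only): with fixed_point_position = 3 (Q4.3),
+// 2.5 (0x14) * 1.5 (0x0C) accumulates 20 * 12 + 4 (the rounding constant)
+// = 244 in 16 bit, and 244 >> 3 = 30 (0x1E), i.e. 3.75.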
+
+inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+ qint16x8_t res1 = res0;
+
+ // Vector multiply-accumulate long
+ res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
+ res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
+
+ // Shift right by fixed_point_position
+ res0 = vshlq_s16(res0, fixed_point_position_s16);
+ res1 = vshlq_s16(res1, fixed_point_position_s16);
+
+ // Convert back to qint8
+ return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
+}
+
+inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary result with a constant used to round up the result
+ qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+ // Vector multiply-accumulate long
+ res = vmlal_s8(res, a, b);
+
+ // Shift right by fixed_point_position
+ res = vqshlq_s16(res, fixed_point_position_s16);
+
+ // Convert back to qint8 and saturate
+ return vqmovn_s16(res);
+}
+
+inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+ qint16x8_t res1 = res0;
+
+ // Vector multiply-accumulate long
+ res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
+ res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
+
+ // Shift right by fixed_point_position
+ res0 = vqshlq_s16(res0, fixed_point_position_s16);
+ res1 = vqshlq_s16(res1, fixed_point_position_s16);
+
+ // Convert back to qint8 and saturate
+ return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
+}
+
+inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ qint16x8_t res = vmull_s8(a, b);
+
+ return vqrshlq_s16(res, fixed_point_position_s16);
+}
+
+inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+ // Vector multiply-accumulate long
+ tmp = vmlal_s8(tmp, b, c);
+
+ // Shift right by fixed_point_position
+ tmp = vshlq_s16(tmp, fixed_point_position_s16);
+
+ // Convert back to qint8 and accumulate
+ return vadd_s8(a, vmovn_s16(tmp));
+}
+
+inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+ qint16x8_t tmp1 = tmp0;
+
+ // Vector multiply-accumulate long
+ tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
+ tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
+
+ // Shift right by fixed_point_position
+ tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
+ tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
+
+ // Convert back to qint8 and accumulate
+ return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
+}
+
+inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+ // Vector multiply-accumulate long
+ tmp = vmlal_s8(tmp, b, c);
+
+ // Shift right by fixed_point_position
+ tmp = vqshlq_s16(tmp, fixed_point_position_s16);
+
+ // Convert back to qint8 and accumulate
+ return vqadd_s8(a, vqmovn_s16(tmp));
+}
+
+inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+ qint16x8_t tmp1 = tmp0;
+
+ // Vector multiply-accumulate long
+ tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
+ tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
+
+ // Shift right by fixed_point_position
+ tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
+ tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
+
+ // Convert back to qint8 and accumulate
+ qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
+ return vqaddq_s8(a, res);
+}
+
+inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+ // Vector multiply-accumulate long
+ tmp = vmlal_s8(tmp, b, c);
+
+ // Shift right by fixed_point_position
+ tmp = vshlq_s16(tmp, fixed_point_position_s16);
+
+ // Accumulate
+ return vaddq_s16(a, tmp);
+}
+
+inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+ const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+ // Initialize the temporary results with a constant used to round up the result
+ qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+ // Vector multiply-accumulate long
+ tmp = vmlal_s8(tmp, b, c);
+
+ // Shift right by fixed_point_position
+ tmp = vqshlq_s16(tmp, fixed_point_position_s16);
+
+ // Accumulate
+ return vqaddq_s16(a, tmp);
+}
+
+inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
+{
+ const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+ float32x4x2_t res_f32 =
+ {
+ {
+ vdupq_n_f32(0.5f),
+ vdupq_n_f32(0.5f)
+ }
+ };
+
+ res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+ res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+
+ const int32x4x2_t res_s32 =
+ {
+ {
+ vcvtq_s32_f32(res_f32.val[0]),
+ vcvtq_s32_f32(res_f32.val[1]),
+ }
+ };
+
+ const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
+
+ return vqmovn_s16(res_s16);
+}
+
+inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
+{
+ const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+ float32x4x4_t res_f32 =
+ {
+ {
+ vdupq_n_f32(0.5f),
+ vdupq_n_f32(0.5f),
+ vdupq_n_f32(0.5f),
+ vdupq_n_f32(0.5f)
+ }
+ };
+
+ res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+ res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+ res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
+ res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
+
+ const int32x4x4_t res_s32 =
+ {
+ {
+ vcvtq_s32_f32(res_f32.val[0]),
+ vcvtq_s32_f32(res_f32.val[1]),
+ vcvtq_s32_f32(res_f32.val[2]),
+ vcvtq_s32_f32(res_f32.val[3]),
+ }
+ };
+
+ const int16x8x2_t res_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
+ vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
+ }
+ };
+
+ return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
+}
+
+inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+ const int16x8_t res_s16 = vmovl_s8(a);
+
+ const int32x4x2_t res_s32 =
+ {
+ {
+ vmovl_s16(vget_low_s16(res_s16)),
+ vmovl_s16(vget_high_s16(res_s16))
+ }
+ };
+
+ float32x4x2_t res_f32 =
+ {
+ {
+ vcvtq_f32_s32(res_s32.val[0]),
+ vcvtq_f32_s32(res_s32.val[1])
+ }
+ };
+
+ res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+ res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+
+ return res_f32;
+}
+
+inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+ const int16x8x2_t res_s16 =
+ {
+ {
+ vmovl_s8(vget_low_s8(a)),
+ vmovl_s8(vget_high_s8(a)),
+ }
+ };
+
+ const int32x4x4_t res_s32 =
+ {
+ {
+ vmovl_s16(vget_low_s16(res_s16.val[0])),
+ vmovl_s16(vget_high_s16(res_s16.val[0])),
+ vmovl_s16(vget_low_s16(res_s16.val[1])),
+ vmovl_s16(vget_high_s16(res_s16.val[1])),
+ }
+ };
+
+ float32x4x4_t res_f32 =
+ {
+ {
+ vcvtq_f32_s32(res_s32.val[0]),
+ vcvtq_f32_s32(res_s32.val[1]),
+ vcvtq_f32_s32(res_s32.val[2]),
+ vcvtq_f32_s32(res_s32.val[3])
+ }
+ };
+
+ res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+ res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+ res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
+ res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
+
+ return res_f32;
+}
+
+inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
+{
+ // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+ const qint8x8_t const_48_over_17 = vdup_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
+ const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823
+ const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
+
+ // Find shift value
+ const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+ const qint8x8_t temp = vshl_s8(a, shift_value);
+
+ qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+ uint8x8_t set_one = vcgt_s8(x, const_one);
+ x = vbsl_s8(set_one, const_one, x);
+
+ // Use three iterations of Newton-Raphson method to get the result
+ x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+ return vshl_s8(x, shift_value);
+}
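+// Illustrative note: this is the classic Newton-Raphson division scheme: the
+// input is normalised towards the [0.5, 1) range, the initial guess
+// 48/17 - 32/17 * a is the minimax linear approximation of 1/a on that range,
+// and each iteration refines x <- x + x * (1 - a * x), roughly doubling the
+// number of correct bits.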
+
+inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+ const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
+ const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
+ const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
+
+ // Find shift value
+ const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+ const qint8x16_t temp = vshlq_s8(a, shift_value);
+
+ qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+ // Set initial guess to one if x > 1
+ uint8x16_t set_one = vcgtq_s8(x, const_one);
+ x = vbslq_s8(set_one, const_one, x);
+
+ // Use three iterations of Newton-Raphson method to get the result
+ x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+ return vshlq_s8(x, shift_value);
+}
+
+inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+ const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
+ const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 (subtracted below)
+ const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
+
+ // Find shift value
+ const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+ const qint8x16_t temp = vqshlq_s8(a, shift_value);
+
+ qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+ // Set initial guess to one if x > 1
+ uint8x16_t set_one = vcgtq_s8(x, const_one);
+ x = vbslq_s8(set_one, const_one, x);
+
+ // Use three iterations of Newton-Raphson method to get the result
+ x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+ x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+ return vqshlq_s8(x, shift_value);
+}
+
+inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+ return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+ return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+template <bool islog>
+inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+ const qint8x8_t const_one = vdup_n_s8(1);
+ const qint8x8_t A = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
+ const qint8x8_t B = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+ const qint8x8_t C = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+ const qint8x8_t D = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+ const qint8x8_t x1 = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
+ const qint8x8_t x2 = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
+ const qint8x8_t x3 = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
+ const qint8x8_t res = vmul_qs8(a, x3, fixed_point_position);
+ return res;
+}
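+// Illustrative note: the steps above evaluate, via Horner's scheme,
+// res = a * (A + a * (B + a * (C + a * D))), i.e. a 4th degree polynomial
+// with no constant term built from the exp or log coefficient tables.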
+
+template <bool islog>
+inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+ const qint8x8_t const_one = vdup_n_s8(1);
+ const qint8x8_t A = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
+ const qint8x8_t B = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+ const qint8x8_t C = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+ const qint8x8_t D = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+ const qint8x8_t x1 = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
+ const qint8x8_t x2 = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
+ const qint8x8_t x3 = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
+ const qint8x8_t res = vqmul_qs8(a, x3, fixed_point_position);
+ return res;
+}
+
+template <bool islog>
+inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+ const qint8x16_t const_one = vdupq_n_s8(1);
+ const qint8x16_t A = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
+ const qint8x16_t B = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+ const qint8x16_t C = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+ const qint8x16_t D = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+ const qint8x16_t x1 = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
+ const qint8x16_t x2 = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
+ const qint8x16_t x3 = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
+ const qint8x16_t res = vmulq_qs8(a, x3, fixed_point_position);
+ return res;
+}
+
+template <bool islog>
+inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+ const qint8x16_t const_one = vdupq_n_s8(1);
+ const qint8x16_t A = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
+ const qint8x16_t B = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+ const qint8x16_t C = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+ const qint8x16_t D = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+ const qint8x16_t x1 = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
+ const qint8x16_t x2 = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
+ const qint8x16_t x3 = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
+ const qint8x16_t res = vqmulq_qs8(a, x3, fixed_point_position);
+ return res;
+}
+
+inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const qint8x8_t shift_value = vdup_n_s8(fixed_point_position - 7);
+ const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
+ const qint8x8_t const_ln2 = vqrshl_s8(vdup_n_s8(0x58), shift_value); // ln(2)
+ const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+ // Perform range reduction [-log(2),log(2)]
+ const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+ // get decimal part from m
+ const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
+
+ qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+ alpha = vqabs_qs8(vqsub_s8(a, alpha));
+
+ // Polynomial Approximation
+ qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
+ poly = vqadd_s8(poly, const_one);
+
+ // Reconstruct
+ poly = vqshl_s8(poly, dec_m);
+
+ return poly;
+}
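+// Illustrative note: this is the usual range reduction
+// exp(a) = 2^n * exp(r), with n = floor(a / ln(2)) held in dec_m and
+// r = |a - n * ln(2)| in alpha; the polynomial approximates exp(r) - 1 and
+// the final vqshl_s8 by dec_m performs the multiplication by 2^n.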
+
+inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const qint8x16_t shift_value = vdupq_n_s8(fixed_point_position - 7);
+ const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
+ const qint8x16_t const_ln2 = vqrshlq_s8(vdupq_n_s8(0x58), shift_value); // ln(2)
+ const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+ // Perform range reduction [-log(2),log(2)]
+ const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+ // get decimal part from m
+ const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
+
+ qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+ alpha = vqabsq_qs8(vqsubq_qs8(a, alpha));
+
+ // Polynomial Approximation
+ qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
+ poly = vqaddq_s8(poly, const_one);
+
+ // Reconstruct
+ poly = vqshlq_s8(poly, dec_m);
+
+ return poly;
+}
+
+inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
+ const qint8x8_t const_seven_dec = vdup_n_s8(7);
+ const qint8x8_t const_ln2 = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+ // If 0 < a < 1, calculate log(1/x)
+ uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
+ qint8x8_t recip = vdup_n_s8(0);
+ recip = vbsl_s8(calc_reciprocal, a, recip);
+
+ // Calculate reciprocal
+ recip = vrecip_qs8(recip, fixed_point_position);
+ a = vbsl_s8(calc_reciprocal, recip, a);
+
+ // Get decimal part of a
+ qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
+ qint8x8_t dec_a = vshl_s8(a, shift_value); // a >> fixed_point_position
+
+ // Get exponent of 2^n which is equal or less than dec_a
+ shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
+
+ // Get x to range (1, 2]
+ const qint8x8_t shift_value_neg = vneg_s8(shift_value);
+ const qint8x8_t temp = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
+ const qint8x8_t sum = vmul_s8(shift_value, const_one);
+
+ // Polynomial Approximation
+ qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
+
+ // Reconstruct
+ poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
+
+ // Set negative value for 0 < a < 1
+ poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
+
+ return poly;
+}
+
+inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
+ const qint8x16_t const_seven_dec = vdupq_n_s8(7);
+ const qint8x16_t const_ln2 = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+ // If 0 < a < 1, calculate log(1/x)
+ uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
+ qint8x16_t recip = vdupq_n_s8(0);
+ recip = vbslq_s8(calc_reciprocal, a, recip);
+
+ // Calculate reciprocal
+ recip = vrecipq_qs8(recip, fixed_point_position);
+ a = vbslq_s8(calc_reciprocal, recip, a);
+
+ // Get decimal part of a
+ qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
+ qint8x16_t dec_a = vshlq_s8(a, shift_value); // a >> fixed_point_position
+
+ // Get exponent of 2^n which is equal or less than dec_a
+ shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
+
+ // Get x to range (1, 2]
+ const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
+ const qint8x16_t temp = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
+ const qint8x16_t sum = vmulq_s8(shift_value, const_one);
+
+ // Polynomial Approximation
+ qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
+
+ // Reconstruct
+ poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
+
+ // Set negative value for 0 < a < 1
+ poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
+
+ return poly;
+}
+
+inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+ // Find shift value. Number must be in (0.5, 2) range.
+ qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+
+ // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+ qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+ uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
+ temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+ qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+
+ temp = vshl_s8(a, shift_value);
+
+ // Initial guess
+ qint8x8_t x = temp;
+
+ // Calculate (x / 2) * (3 - a * x^2)
+ // After three iterations we have the result for 8 bit
+ x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+ return vshl_s8(x, shift_value2);
+}
+
+inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+ // Find shift value. Number must be in (0.5, 2) range.
+ qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+
+ // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+ qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+ uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
+ temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+ qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+
+ temp = vshl_s8(a, shift_value);
+
+ // Initial guess
+ qint8x8_t x = temp;
+
+ // Calculate (x / 2) * (3 - a * x^2)
+ // After three iterations we have the result for 8 bit
+ x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+ return vshl_s8(x, shift_value2);
+}
+
+inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
+
+ // Find shift value. Number must be in (0.5, 2) range.
+ qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+
+ // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+ qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
+ uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
+ temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
+ qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
+
+ temp = vshlq_s8(a, shift_value);
+
+ // Initial guess
+ qint8x16_t x = temp;
+
+ // Calculate (x / 2) * (3 - a * x^2)
+ // After three iterations we have the result for 8 bit
+ x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+ return vshlq_s8(x, shift_value2);
+}
+
+inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
+
+ // Find shift value. Number must be in (0.5, 2) range.
+ qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+
+ // Add one when the shift value is negative in order to get the correct result when we shift right by 1
+ qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
+ uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0));
+ temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
+ qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
+
+ temp = vshlq_s8(a, shift_value);
+
+ // Initial guess
+ qint8x16_t x = temp;
+
+ // Calculate (x / 2) * (3 - a * x^2)
+ // After three iterations we have the result for 8 bit
+ x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+ x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+ return vshlq_s8(x, shift_value2);
+}
+
+inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
+{
+ const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
+ const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
+
+ qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
+ qint8x8_t num = vqsub_qs8(exp2x, const_one);
+ qint8x8_t den = vqadd_qs8(exp2x, const_one);
+ qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position);
+
+ return tanh;
+}
+
+inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
+{
+ const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
+ const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
+
+ qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
+ qint8x16_t num = vqsubq_qs8(exp2x, const_one);
+ qint8x16_t den = vqaddq_qs8(exp2x, const_one);
+ qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
+
+ return tanh;
+}
+
+inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+ return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
+}
+}
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
new file mode 100644
index 0000000000..eaa50f123b
--- /dev/null
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEKERNELS_H__
+#define __ARM_COMPUTE_NEKERNELS_H__
+
+/* Convenience header grouping all the NEON kernels */
+#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
+#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
+#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
new file mode 100644
index 0000000000..bb8a330c1e
--- /dev/null
+++ b/arm_compute/core/NEON/NEMath.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMATH_H__
+#define __ARM_COMPUTE_NEMATH_H__
+
+#include <arm_neon.h>
+#include <array>
+
+namespace arm_compute
+{
+/** Calculate inverse square root.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated inverse square root.
+ */
+float32x4_t vinvsqrtq_f32(float32x4_t x);
+
+/** Calculate reciprocal.
+ *
+ * @param[in] x Input value.
+ *
+ * @return The calculated reciprocal.
+ */
+float32x4_t vinvq_f32(float32x4_t x);
+
+/** Perform a 7th degree polynomial approximation using Estrin's method.
+ *
+ * @param[in] x Input vector value in F32 format.
+ * @param[in] coeffs Polynomial coefficients table.
+ *
+ * @return The calculated approximation.
+ */
+float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
+
+/** Calculate the exponential.
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated exponential.
+ */
+float32x4_t vexpq_f32(float32x4_t x);
+
+/** Calculate the natural logarithm.
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+float32x4_t vlogq_f32(float32x4_t x);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflowing issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+float32x4_t vtanhq_f32(float32x4_t val);
+
+/** Calculate the n-th power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n Power to raise each lane of the input to.
+ *
+ * @return The calculated power.
+ */
+float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
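+
+/* Illustrative usage sketch (not part of the library API; values approximate):
+ *
+ *   float32x4_t x = vdupq_n_f32(0.5f);
+ *   float32x4_t e = vexpq_f32(x);                    // ~1.6487 per lane
+ *   float32x4_t t = vtanhq_f32(x);                   // ~0.4621 per lane
+ *   float32x4_t p = vpowq_f32(x, vdupq_n_f32(2.f));  // ~0.25 per lane
+ */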
+}
+#include "arm_compute/core/NEON/NEMath.inl"
+#endif /* __ARM_COMPUTE_NEMATH_H__ */
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
new file mode 100644
index 0000000000..a31a4c0dc5
--- /dev/null
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/* Exponential polynomial coefficients, stored in the interleaved order consumed by vtaylor_polyq_f32 (see below), not in ascending power order */
+const std::array<float32x4_t, 8> exp_tab =
+{
+ {
+ vdupq_n_f32(1.f),
+ vdupq_n_f32(0.0416598916054f),
+ vdupq_n_f32(0.500000596046f),
+ vdupq_n_f32(0.0014122662833f),
+ vdupq_n_f32(1.00000011921f),
+ vdupq_n_f32(0.00833693705499f),
+ vdupq_n_f32(0.166665703058f),
+ vdupq_n_f32(0.000195780929062f),
+ }
+};
+
+/* Logarithm polynomial coefficients, stored in the same interleaved order */
+const std::array<float32x4_t, 8> log_tab =
+{
+ {
+ vdupq_n_f32(-2.29561495781f),
+ vdupq_n_f32(-2.47071170807f),
+ vdupq_n_f32(-5.68692588806f),
+ vdupq_n_f32(-0.165253549814f),
+ vdupq_n_f32(5.17591238022f),
+ vdupq_n_f32(0.844007015228f),
+ vdupq_n_f32(4.58445882797f),
+ vdupq_n_f32(0.0141278216615f),
+ }
+};
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+ return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+ float32x4_t recip = vrecpeq_f32(x);
+ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+ return recip;
+}
+
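+// Evaluates the degree-7 polynomial
+//   coeffs[0] + coeffs[4]*x + coeffs[2]*x^2 + coeffs[6]*x^3
+// + coeffs[1]*x^4 + coeffs[5]*x^5 + coeffs[3]*x^6 + coeffs[7]*x^7
+// using Estrin's scheme: the four multiply-accumulates below are mutually
+// independent, which exposes instruction-level parallelism at the cost of
+// the permuted coefficient order used by exp_tab and log_tab above.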
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
+{
+ float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x);
+ float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x);
+ float32x4_t C = vmlaq_f32(coeffs[1], coeffs[5], x);
+ float32x4_t D = vmlaq_f32(coeffs[3], coeffs[7], x);
+ float32x4_t x2 = vmulq_f32(x, x);
+ float32x4_t x4 = vmulq_f32(x2, x2);
+ float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
+ return res;
+}
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+ static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+ static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
+
+ // Range reduction: x = m * ln(2) + val, with val in [-ln(2), ln(2)]
+ int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
+ float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
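+
+ // e.g. x = 1.0f: m = trunc(1.0 * 1.4427) = 1 and val = 1 - ln(2) ~ 0.3069,
+ // so the polynomial evaluates e^0.3069 ~ 1.3591 and the reconstruction
+ // below scales it by 2^1, giving ~2.7183 ~ e^1 (illustrative values)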
+
+ // Polynomial Approximation
+ float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+ // Reconstruct: multiply by 2^m by adding m to the IEEE-754 exponent bits
+ poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
+
+ return poly;
+}
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+ static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
+ static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+
+ // Extract the exponent m and the mantissa val in [1, 2)
+ int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+ float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
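+
+ // e.g. x = 8.0f: m = 3 and val = 1.0 (the mantissa); the polynomial gives
+ // log(1.0) ~ 0 and the reconstruction below adds 3 * ln(2) ~ 2.0794 = log(8)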
+
+ // Polynomial Approximation
+ float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
+
+ // Reconstruct: log(x) = log(val) + m * ln(2)
+ poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+
+ return poly;
+}
+
+inline float32x4_t vtanhq_f32(float32x4_t val)
+{
+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+ static const float32x4_t CONST_2 = vdupq_n_f32(2.f);
+ static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
+ static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
+
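+ // tanh saturates to +/-1 well before |x| = 10, so clamping protects
+ // vexpq_f32 from overflow without measurably changing the result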
+ float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+ float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
+ float32x4_t num = vsubq_f32(exp2x, CONST_1);
+ float32x4_t den = vaddq_f32(exp2x, CONST_1);
+ float32x4_t tanh = vmulq_f32(num, vinvq_f32(den));
+ return tanh;
+}
+
+inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
+{
+ return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+} \ No newline at end of file
diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
new file mode 100644
index 0000000000..9ef93ce67a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__
+#define __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the absolute difference kernel
+ *
+ * Absolute difference is computed by:
+ * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
+ */
+class NEAbsoluteDifferenceKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEAbsoluteDifferenceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default;
+ /** Default destructor */
+ ~NEAbsoluteDifferenceKernel() = default;
+
+ /** Set the inputs and output tensors
+ *
+ * @param[in] input1 Source tensor. Data types supported: U8/S16
+ * @param[in] input2 Source tensor. Data types supported: U8/S16
+ * @param[out] output Destination tensor. Data types supported: U8 (only if both inputs are U8)/S16.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the specialised absolute difference functions
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S16.
+ * @param[in] input2 An input tensor. Data types supported: U8/S16.
+ * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8)/S16.
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+
+ /** Absolute difference function to use for the particular tensor formats passed to configure() */
+ AbsDiffFunction *_func;
+ const ITensor *_input1;
+ const ITensor *_input2;
+ ITensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
new file mode 100644
index 0000000000..df6d7b8891
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_NEACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the accumulate kernel
+ *
+ * Accumulation is computed by:
+ * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
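+ *
+ * A minimal usage sketch (illustrative only; assumes @p input and @p accum
+ * are already allocated and initialised):
+ * @code
+ * NEAccumulateKernel kernel;
+ * kernel.configure(&input, &accum);
+ * kernel.run(kernel.window()); // or hand the kernel to a scheduler
+ * @endcode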
+ */
+class NEAccumulateKernel : public INESimpleKernel
+{
+public:
+ /** Set the input and accumulation tensors
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] accum Destination tensor. Data type supported: S16.
+ */
+ void configure(const ITensor *input, ITensor *accum);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+
+/** Interface for the accumulate weighted kernel
+ *
+ * Weighted accumulation is computed:
+ * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
+ *
+ * Where @f$ 0 \le \alpha \le 1 @f$
+ * Conceptually, the rounding for this is defined as:
+ * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
+*/
+class NEAccumulateWeightedKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEAccumulateWeightedKernel();
+ /** Set the input and accumulation tensors, and the scale value
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[in] alpha Scalar value in the range [0.0f, 1.0f]
+ * @param[in,out] accum Accumulated tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input, float alpha, ITensor *accum);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+protected:
+ float _alpha;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Interface for the accumulate weighted kernel using F16 */
+class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+#else
+using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
+#endif
+
+/** Interface for the accumulate squared kernel
+ *
+ * The accumulation of squares is computed:
+ * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
+ *
+ * Where @f$ 0 \le shift \le 15 @f$
+*/
+class NEAccumulateSquaredKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEAccumulateSquaredKernel();
+ /** Set the input and accumulation tensors and the shift value.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[in] shift Shift value in the range of [0, 15]
+ * @param[in,out] accum Accumulated tensor. Data type supported: S16.
+ */
+ void configure(const ITensor *input, uint32_t shift, ITensor *accum);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ uint32_t _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEACCUMULATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
new file mode 100644
index 0000000000..97f92d6a1e
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the activation layer kernel. */
+class NEActivationLayerKernel : public INESimpleKernel
+{
+public:
+ /** Constructor */
+ NEActivationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEActivationLayerKernel(const NEActivationLayerKernel &) = delete;
+ /** Default move constructor */
+ NEActivationLayerKernel(NEActivationLayerKernel &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete;
+ /** Default move assignment operator */
+ NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default;
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] activation_info Activation layer information.
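+ *
+ * Illustrative call (hypothetical tensor names; RELU chosen arbitrarily):
+ * @code
+ * NEActivationLayerKernel act;
+ * act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ * @endcode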
+ */
+ void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ using ActivationFunction = ActivationLayerInfo::ActivationFunction;
+ /** Common signature for all the specialised @ref NEActivationLayerKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window);
+ /** Function to apply an activation function on a tensor.
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ template <ActivationLayerInfo::ActivationFunction F, typename T>
+ typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window);
+ /** Function to apply an activation function on a tensor.
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ template <ActivationLayerInfo::ActivationFunction F, typename T>
+ typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
+
+private:
+ ActivationFunctionExecutorPtr _func;
+ ActivationLayerInfo _act_info;
+};
+}
+#endif /*__ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
new file mode 100644
index 0000000000..b36ca46e1a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__
+#define __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform addition between two tensors */
+class NEArithmeticAdditionKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEArithmeticAdditionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticAdditionKernel(const NEArithmeticAdditionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticAdditionKernel &operator=(const NEArithmeticAdditionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEArithmeticAdditionKernel(NEArithmeticAdditionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEArithmeticAdditionKernel &operator=(NEArithmeticAdditionKernel &&) = default;
+ /** Default destructor */
+ ~NEArithmeticAdditionKernel() = default;
+
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+ * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32).
+ * @param[in] policy Overflow policy.
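+ *
+ * Illustrative call (hypothetical tensor names):
+ * @code
+ * NEArithmeticAdditionKernel add;
+ * add.configure(&input1, &input2, &output, ConvertPolicy::SATURATE);
+ * @endcode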
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the specialised add functions
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F32.
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+ * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32).
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+ /** Add function to use for the particular tensor types passed to configure() */
+ AddFunction *_func;
+ const ITensor *_input1;
+ const ITensor *_input2;
+ ITensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
new file mode 100644
index 0000000000..0eb9c23686
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__
+#define __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform subtraction between two tensors */
+class NEArithmeticSubtractionKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEArithmeticSubtractionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default;
+ /** Default destructor */
+ ~NEArithmeticSubtractionKernel() = default;
+
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/S16/F32
+ * @param[in] input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+ * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32).
+ * @param[in] policy Overflow policy.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the specialised sub functions
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8, S16, F32.
+ * @param[in] input2 An input tensor. Data types supported: U8, S16, F32 (only if @p input1 is F32).
+ * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F32 (only if both inputs are F32)
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+ /** Sub function to use for the particular tensor types passed to configure() */
+ SubFunction *_func;
+ const ITensor *_input1;
+ const ITensor *_input2;
+ ITensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000000..29fcbd26a0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the batch normalization layer kernel.
+ *
+ * The result is computed per feature map as:
+ * @f[ out(x,y) = \gamma \cdot \frac{in(x,y) - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta @f]
+ */
+class NEBatchNormalizationLayerKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEBatchNormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~NEBatchNormalizationLayerKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division with zero.
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ using BatchNormFunction = void(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window);
+ BatchNormFunction *_func;
+ const ITensor *_input;
+ ITensor *_output;
+ const ITensor *_mean;
+ const ITensor *_var;
+ const ITensor *_gamma;
+ const ITensor *_beta;
+ float _epsilon;
+};
+}
+#endif /*__ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
new file mode 100644
index 0000000000..b931445419
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEANDKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEANDKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise AND between XY-planes of two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
+ */
+class NEBitwiseAndKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEBitwiseAndKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseAndKernel(const NEBitwiseAndKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseAndKernel &operator=(const NEBitwiseAndKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input1 An input tensor. Data type supported: U8.
+ * @param[in] input2 An input tensor. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
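+ *
+ * Illustrative call (hypothetical U8 tensor names):
+ * @code
+ * NEBitwiseAndKernel band;
+ * band.configure(&input1, &input2, &output);
+ * @endcode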
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input1; /**< Source tensor 1 */
+ const ITensor *_input2; /**< Source tensor 2 */
+ ITensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEANDKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
new file mode 100644
index 0000000000..e34eb0f5ae
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISENOTKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISENOTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise NOT operation
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = \lnot input(x,y) @f]
+ */
+class NEBitwiseNotKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEBitwiseNotKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseNotKernel(const NEBitwiseNotKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseNotKernel &operator=(const NEBitwiseNotKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default;
+ /** Initialise the kernel's input and output
+ *
+ * @param[in] input An input tensor. Data type supported: U8.
+ * @param[out] output The output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISENOTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
new file mode 100644
index 0000000000..d2bae2660c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEORKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise inclusive OR between two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
+ */
+class NEBitwiseOrKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEBitwiseOrKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseOrKernel(const NEBitwiseOrKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseOrKernel &operator=(const NEBitwiseOrKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input1 An input tensor. Data type supported: U8.
+ * @param[in] input2 An input tensor. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input1; /**< Source tensor 1 */
+ const ITensor *_input2; /**< Source tensor 2 */
+ ITensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
new file mode 100644
index 0000000000..9dea36e7e3
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEXORKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEXORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise exclusive OR (XOR) between two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
+ */
+class NEBitwiseXorKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEBitwiseXorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseXorKernel(const NEBitwiseXorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseXorKernel &operator=(const NEBitwiseXorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input1 An input tensor. Data type supported: U8.
+ * @param[in] input2 An input tensor. Data type supported: U8.
+ * @param[out] output The output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input1; /**< Source tensor 1 */
+ const ITensor *_input2; /**< Source tensor 2 */
+ ITensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEXORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
new file mode 100644
index 0000000000..6b7bebbf17
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBOX3x3KERNEL_H__
+#define __ARM_COMPUTE_NEBOX3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Box 3x3 filter */
+class NEBox3x3Kernel : public INESimpleKernel
+{
+public:
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
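+ *
+ * Illustrative call (hypothetical tensor names):
+ * @code
+ * NEBox3x3Kernel box;
+ * box.configure(&src, &dst, false /* border is replicate or constant */);
+ * @endcode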
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform a Box 3x3 filter using F16 SIMD
+ */
+class NEBox3x3FP16Kernel : public NEBox3x3Kernel
+{
+public:
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+#else
+using NEBox3x3FP16Kernel = NEBox3x3Kernel;
+#endif
+}
+#endif /*__ARM_COMPUTE_NEBOX3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
new file mode 100644
index 0000000000..b86085f439
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECANNYEDGEKERNEL_H__
+#define __ARM_COMPUTE_NECANNYEDGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Computes magnitude and quantised phase from input gradients. */
+class NEGradientKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEGradientKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGradientKernel(const NEGradientKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGradientKernel &operator=(const NEGradientKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGradientKernel(NEGradientKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGradientKernel &operator=(NEGradientKernel &&) = default;
+ /** Default destructor */
+ virtual ~NEGradientKernel() = default;
+
+ /** Initialise the kernel's sources, destinations and border mode.
+ *
+ * @note gx, gy and magnitude must all have the same element size (either 16 or 32 bits)
+ *
+ * @param[in] gx Source tensor - Gx component. Data type supported: S16/S32.
+ * @param[in] gy Source tensor - Gy component. Data type supported: same as @p gx.
+ * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32).
+ * @param[out] phase Destination tensor - Quantized phase. Data type supported: U8.
+ * @param[in] norm_type Normalization type. If 1, the L1 norm is used (@f$ |g_x| + |g_y| @f$); otherwise the L2 norm (@f$ \sqrt{g_x^2 + g_y^2} @f$) is used.
+ */
+ virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+protected:
+ /** Common signature for all the specialised gradient functions
+ *
+ * @param[in] gx_ptr Pointer to the first input tensor.
+ * @param[in] gy_ptr Pointer to the second input tensor.
+ * @param[out] magnitude_ptr Pointer to the first output tensor
+ * @param[out] phase_ptr Pointer to the second output tensor
+ */
+ using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr);
+
+ GradientFunction *_func; /**< Gradient function to use for the particular tensor types passed to configure() */
+ const ITensor *_gx; /**< Source tensor - Gx component */
+ const ITensor *_gy; /**< Source tensor - Gy component */
+ ITensor *_magnitude; /**< Destination tensor - Magnitude */
+ ITensor *_phase; /**< Destination tensor - Quantized phase */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform Gradient computation
+ */
+class NEGradientFP16Kernel : public NEGradientKernel
+{
+public:
+ // Inherited methods overridden:
+ void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override;
+};
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+using NEGradientFP16Kernel = NEGradientKernel;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
+ *
+ * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
+ * to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE.
+ *
+ * @note Hysteresis is computed in @ref NEEdgeTraceKernel
+ */
+class NEEdgeNonMaxSuppressionKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEEdgeNonMaxSuppressionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default;
+ /** Default destructor */
+ ~NEEdgeNonMaxSuppressionKernel() = default;
+
+ /** Initialise the kernel's sources, destination and border mode.
+ *
+ * @param[in] magnitude Source tensor - Magnitude. Data type supported: U16/U32.
+ * @param[in] phase Source tensor - Quantized phase. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge"
+ * @param[in] upper_thr Upper threshold used for the hysteresis
+ * @param[in] lower_thr Lower threshold used for the hysteresis
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Common signature for all the specialised non-maxima suppression functions
+ *
+ * @param[in] magnitude_ptr Pointer to the first input tensor.
+ * @param[in] phase_ptr Pointer to the second input tensor.
+ * @param[out] output_ptr Pointer to the output tensor
+ * @param[in] stride_mag Stride of the magnitude tensor
+ * @param[in] upper_thr Upper threshold used for the hysteresis
+ * @param[in] lower_thr Lower threshold used for the hysteresis
+ */
+ using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t upper_thr,
+ const int32_t lower_thr);
+
+ EdgeNonMaxSupprFunction *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+ const ITensor *_magnitude; /**< Source tensor - Magnitude */
+ const ITensor *_phase; /**< Source tensor - Quantized phase */
+ ITensor *_output; /**< Destination tensor */
+ int32_t _lower_thr; /**< Lower threshold used for the hysteresis */
+ int32_t _upper_thr; /**< Upper threshold used for the hysteresis */
+};
+
+/** NEON kernel to perform Edge tracing */
+class NEEdgeTraceKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEEdgeTraceKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default;
+ /** Default destructor */
+ ~NEEdgeTraceKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in,out] input Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge"
+ * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge).
+ */
+ void configure(ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+ bool is_parallelisable() const override;
+
+private:
+ ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+};
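+
+/* A minimal sketch of how the non-maxima suppression and edge tracing kernels
+ * chain together (editorial addition; the CannyEdge function performs this
+ * wiring, including the border handling elided here). `mag`, `phase`,
+ * `suppressed` and `edges` are assumed pre-allocated; `edges` must be zeroed.
+ *
+ * @code
+ * NEEdgeNonMaxSuppressionKernel nonmax;
+ * nonmax.configure(&mag, &phase, &suppressed, 220, 100, false);
+ * nonmax.run(nonmax.window());               // marks EDGE / NO_EDGE / MAYBE
+ *
+ * NEEdgeTraceKernel trace;
+ * trace.configure(&suppressed, &edges);
+ * trace.run(trace.window());                 // hysteresis resolves MAYBE (127) pixels
+ * @endcode
+ */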
+}
+#endif /* __ARM_COMPUTE_NECANNYEDGEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
new file mode 100644
index 0000000000..8b669a4d28
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel combine kernel */
+class NEChannelCombineKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEChannelCombineKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
+ /** Default destructor */
+ ~NEChannelCombineKernel() = default;
+
+ /** Configure function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
+ * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
+ * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
+ * @param[in] plane3 The 2D plane that forms channel 3. Data type supported: U8
+ * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+ */
+ void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
+ /** Configure function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
+ * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
+ * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
+ * @param[out] output The multi planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
+ */
+ void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ /** Combine 3 planes to form a three channel single plane tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void combine_3C(const Window &win);
+ /** Combine 4 planes to form a four channel single plane tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void combine_4C(const Window &win);
+ /** Combine 3 planes to form a single plane YUV tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ template <bool is_yuyv>
+ void combine_YUV_1p(const Window &win);
+ /** Combine 3 planes to form a two plane YUV tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void combine_YUV_2p(const Window &win);
+ /** Combine 3 planes to form a three plane YUV tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void combine_YUV_3p(const Window &win);
+ /** Copies a full plane to the output tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void copy_plane(const Window &win, uint32_t plane_id);
+ /** Common signature for all the specialised ChannelCombine functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
+ /** ChannelCombine function to use for the particular tensor types passed to configure() */
+ ChannelCombineFunction _func;
+ std::array<const ITensor *, 4> _planes;
+ ITensor *_output;
+ IMultiImage *_output_multi;
+ std::array<uint32_t, 3> _x_subsampling;
+ std::array<uint32_t, 3> _y_subsampling;
+ unsigned int _num_elems_processed_per_iteration;
+ bool _is_parallelizable;
+};
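+
+/* A minimal usage sketch (editorial addition): combining three U8 planes into
+ * a single-plane RGB888 image. `r`, `g`, `b` and `rgb` are assumed
+ * pre-allocated; passing nullptr for the unused fourth plane is assumed valid
+ * for three-channel output formats.
+ *
+ * @code
+ * NEChannelCombineKernel combine;
+ * combine.configure(&r, &g, &b, nullptr, &rgb); // plane3 only needed for RGBA8888
+ * combine.run(combine.window());
+ * @endcode
+ */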
+}
+#endif /* __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
new file mode 100644
index 0000000000..0715e1f8cb
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel extract kernel */
+class NEChannelExtractKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEChannelExtractKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
+ /** Default destructor */
+ ~NEChannelExtractKernel() = default;
+
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
+ * @param[in] channel Channel to extract.
+ * @param[out] output Destination tensor. Format supported: U8
+ */
+ void configure(const ITensor *input, Channel channel, ITensor *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
+ * @param[in] channel Channel to extract.
+ * @param[out] output Single-planar destination image. Format supported: U8
+ */
+ void configure(const IMultiImage *input, Channel channel, IImage *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Extract one channel from a two channel planar tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void extract_1C_from_2C_img(const Window &win);
+ /** Extract one channel from a three channel planar tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void extract_1C_from_3C_img(const Window &win);
+ /** Extract one channel from a four channel planar tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void extract_1C_from_4C_img(const Window &win);
+ /** Extract U/V channel from a single-planar YUYV/UYVY tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void extract_YUYV_uv(const Window &win);
+ /** Copies a full plane to the output tensor.
+ *
+ * @param[in] win Region on which to execute the kernel.
+ */
+ void copy_plane(const Window &win);
+ /** Common signature for all the specialised ChannelExtract functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
+ /** ChannelExtract function to use for the particular tensor types passed to configure() */
+ ChannelExtractFunction _func;
+ unsigned int _lut_index;
+};
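+
+/* A minimal usage sketch (editorial addition): extracting the R channel of an
+ * RGB888 image into a U8 plane. `rgb` and `r` are assumed pre-allocated
+ * tensors of matching width and height.
+ *
+ * @code
+ * NEChannelExtractKernel extract;
+ * extract.configure(&rgb, Channel::R, &r);
+ * extract.run(extract.window());
+ * @endcode
+ */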
+}
+#endif /* __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
new file mode 100644
index 0000000000..f6bc2152da
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOL2IMKERNEL_H__
+#define __ARM_COMPUTE_NECOL2IMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform col2im reshaping.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NECol2ImKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NECol2ImKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECol2ImKernel(const NECol2ImKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECol2ImKernel &operator=(const NECol2ImKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NECol2ImKernel(NECol2ImKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NECol2ImKernel &operator=(NECol2ImKernel &&) = default;
+ /** Default destructor */
+ ~NECol2ImKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+ * while the rest represent batch of outputs. Data types supported: Same as @p input
+ * @param[in] convolved_dims Output convolved dimensions.
+ */
+ void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Template function to run the col2im
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void run_col2im(const Window &window);
+
+ /** Common signature for all the specialised col2im functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
+
+ Col2ImFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+};
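+
+/* A minimal usage sketch (editorial addition): reshaping matrix columns back
+ * into 4x4 image blocks. `col` and `img` are assumed pre-allocated with
+ * shapes consistent with the convolved dimensions.
+ *
+ * @code
+ * NECol2ImKernel col2im;
+ * col2im.configure(&col, &img, std::make_pair(4U, 4U)); // convolved width x height
+ * col2im.run(col2im.window());
+ * @endcode
+ */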
+}
+
+#endif /*__ARM_COMPUTE_NECOL2IMKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
new file mode 100644
index 0000000000..2297218117
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the color convert kernel */
+class NEColorConvertKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEColorConvertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEColorConvertKernel(const NEColorConvertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEColorConvertKernel(NEColorConvertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
+ /** Default destructor */
+ ~NEColorConvertKernel() = default;
+
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+ * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+ * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888)
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
+ */
+ void configure(const IMultiImage *input, IImage *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+ * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
+ */
+ void configure(const IImage *input, IMultiImage *output);
+ /** Set the input and output of the kernel
+ *
+ * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
+ * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
+ */
+ void configure(const IMultiImage *input, IMultiImage *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
+ const void *_input;
+ void *_output;
+ ColorConvertFunction *_func;
+};
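+
+/* A minimal usage sketch (editorial addition): converting a multi-planar NV12
+ * image to a single-planar RGB888 image. `nv12` (an IMultiImage) and `rgb`
+ * are assumed pre-allocated with matching dimensions.
+ *
+ * @code
+ * NEColorConvertKernel convert;
+ * convert.configure(&nv12, &rgb); // multi-planar to single-planar overload
+ * convert.run(convert.window());
+ * @endcode
+ */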
+}
+#endif /*__ARM_COMPUTE_NECOLORCONVERTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
new file mode 100644
index 0000000000..588a228a5d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/****************************************************************************************\
+ * Square Convolution *
+\****************************************************************************************/
+
+/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
+ * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{
+ * k_0 &=& \frac{m}{2} \\
+ * l_0 &=& \frac{n}{2} \\
+ * sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
+ * @f}
+ *
+ * @note The above equation for this function is similar to the default OpenCV Filter2D function,
+ * which actually computes a correlation and not a convolution.
+ * In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
+ */
+template <unsigned int matrix_size>
+class NEConvolutionKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEConvolutionKernel();
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U8, S16.
+ * @param[in] conv Convolution matrix to apply to the input tensor.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ template <typename OutputType>
+ void convolution(const Window &win);
+
+protected:
+ uint32_t _scale; /**< scale of the convolution */
+ std::array<int16_t, matrix_size *matrix_size> _convolution; /**< convolution matrix */
+};
+
+/** Interface for the kernel which applies a 3x3 convolution to a tensor.*/
+using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
+/** Interface for the kernel which applies a 5x5 convolution to a tensor.*/
+using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
+/** Interface for the kernel which applies a 7x7 convolution to a tensor.*/
+using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
+/** Interface for the kernel which applies a 9x9 convolution to a tensor.*/
+using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
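+
+/* A minimal usage sketch (editorial addition): a 3x3 smoothing convolution.
+ * `src` (U8) and `dst` (S16) are assumed pre-allocated and border handling is
+ * elided. With a scale of 0 the kernel normalises by the coefficient sum (16).
+ *
+ * @code
+ * const int16_t conv[9] = { 1, 2, 1,
+ *                           2, 4, 2,
+ *                           1, 2, 1 };
+ * NEConvolution3x3Kernel conv3x3;
+ * conv3x3.configure(&src, &dst, conv, 0, false);
+ * conv3x3.run(conv3x3.window());
+ * @endcode
+ */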
+
+/****************************************************************************************\
+ * Separable Square Convolution *
+\****************************************************************************************/
+
+/** Kernel for the Horizontal pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionHorKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NESeparableConvolutionHorKernel();
+
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U16, S16, S32.
+ * @param[in] conv_row Convolution matrix to apply to the input tensor.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Apply the object's convolution to the given window of the input tensor.
+ *
+ * @param[in] window Window to apply the convolution on.
+ */
+ template <typename OutputType>
+ void convolve(const Window &window);
+
+ std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
+ BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel which applies a 5x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
+/** Interface for the kernel which applies a 7x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
+/** Interface for the kernel which applies a 9x1 horizontal convolution to a tensor.*/
+using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
+
+/** Kernel for the Vertical pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionVertKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NESeparableConvolutionVertKernel();
+
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U16, S16, S32.
+ * @param[out] output Destination tensor. Data types supported: U8, S16.
+ * @param[in] conv_col Convolution matrix to apply to the input tensor.
+ * @param[in] scale Scale of the convolution matrix
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Apply the object's convolution to the given window of the input tensor.
+ * This function is used if the intermediate values have been stored as U16.
+ *
+ * @param[in] win Window to apply the convolution on.
+ */
+ template <typename OutputType>
+ void convolution_u16(const Window &win);
+ /** Apply the object's convolution to the given window of the input tensor.
+ * This function is used if the intermediate values have been stored as S16.
+ *
+ * @param[in] win Window to apply the convolution on.
+ */
+ template <typename OutputType>
+ void convolution_s16(const Window &win);
+ /** Apply the object's convolution to the given window of the input tensor.
+ * This function is used if the intermediate values have been stored as S32.
+ *
+ * @param[in] win Window to apply the convolution on.
+ */
+ template <typename OutputType>
+ void convolution_s32(const Window &win);
+
+ std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
+ uint32_t _scale; /**< Convolution's scale */
+};
+
+/** Interface for the kernel which applies a 1x5 vertical convolution to a tensor.*/
+using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applies a 1x7 vertical convolution to a tensor.*/
+using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applies a 1x9 vertical convolution to a tensor.*/
+using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
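+
+/* A minimal usage sketch (editorial addition): a separable 5x5 convolution,
+ * run as a horizontal pass into an intermediate tensor followed by a vertical
+ * pass. `src` (U8), `tmp` (U16) and `dst` (U8) are assumed pre-allocated; the
+ * scale is applied once, in the vertical pass.
+ *
+ * @code
+ * const int16_t row[5] = { 1, 4, 6, 4, 1 };
+ * const int16_t col[5] = { 1, 4, 6, 4, 1 };
+ *
+ * NESeparableConvolution5x5HorKernel  hor;
+ * NESeparableConvolution5x5VertKernel vert;
+ * hor.configure(&src, &tmp, row, false);
+ * vert.configure(&tmp, &dst, col, 256, false); // 256 = sum of the 5x5 coefficients
+ * hor.run(hor.window());
+ * vert.run(vert.window());
+ * @endcode
+ */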
+
+/****************************************************************************************\
+ * Rectangle Convolution *
+\****************************************************************************************/
+
+/** Kernel for running a convolution on a rectangular matrix.
+ *
+ * @note Supports width/height combinations of 3, 5, 7 and 9.
+ */
+class NEConvolutionRectangleKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEConvolutionRectangleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConvolutionRectangleKernel(const NEConvolutionRectangleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConvolutionRectangleKernel &operator=(const NEConvolutionRectangleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data types supported: U8, S16.
+ * @param[in] conv Convolution matrix to apply to the input tensor.
+ * @param[in] width Width of convolution matrix (Number of columns)
+ * @param[in] height Height of convolution matrix (Number of rows)
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ unsigned int get_index(uint32_t val);
+ /** Apply the object's convolution to the given window of the input tensor.
+ *
+ * @param[in] win Window to apply the convolution on.
+ */
+ template <typename OutputType, unsigned int rows, unsigned int cols>
+ void convolution(const Window &win);
+
+protected:
+ const ITensor *_input; /**< Input tensor */
+ ITensor *_output; /**< Output tensor */
+ uint32_t _scale; /**< Scale of the convolution */
+ std::vector<int16_t> _convolution; /**< Convolution matrix */
+ BorderSize _border_size; /**< Calculated border width */
+ uint32_t _func_idx; /**< Index used to specify convolution function to be used */
+ const static unsigned int _nr_supported_sizes
+ {
+ 4
+ }; /**< Number of supported permutations */
+};
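+
+/* A minimal usage sketch (editorial addition): a 5x3 box convolution (width 5,
+ * height 3). `src` (U8) and `dst` (S16) are assumed pre-allocated.
+ *
+ * @code
+ * const int16_t conv[15] = { 1, 1, 1, 1, 1,
+ *                            1, 1, 1, 1, 1,
+ *                            1, 1, 1, 1, 1 };
+ * NEConvolutionRectangleKernel rect;
+ * rect.configure(&src, &dst, conv, 5, 3, 0, false); // scale 0 -> normalise by 15
+ * rect.run(rect.window());
+ * @endcode
+ */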
+}
+#endif /*__ARM_COMPUTE_NECONVOLUTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
new file mode 100644
index 0000000000..67b8c6052d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ILut;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the cumulative distribution (cumulative summation) calculation kernel.
+ *
+ * This kernel calculates the cumulative sum of a given distribution (meaning that each output element
+ * is the sum of all its previous elements including itself) and creates a lookup table with the normalized
+ * pixel intensities, which is used to improve the contrast of the image.
+ */
+class NECumulativeDistributionKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NECumulativeDistributionKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECumulativeDistributionKernel(const NECumulativeDistributionKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECumulativeDistributionKernel &operator=(const NECumulativeDistributionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default;
+ /** Set the input and output distribution.
+ *
+ * @param[in] input Input image. Data type supported: U8
+ * @param[in] distribution Unnormalized 256-bin distribution of the input image.
+ * @param[out] cumulative_sum Cumulative distribution (summed histogram). Should be the same size as @p distribution.
+ * @param[out] output Equalization lookup table. Should consist of 256 entries of U8 elements.
+ */
+ void configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ const IImage *_input; /**< Input image. */
+ const IDistribution1D *_distribution; /**< Input histogram of the input image. */
+ IDistribution1D *_cumulative_sum; /**< The cumulative distribution. */
+ ILut *_output; /**< Output with the equalization lookup table. */
+private:
+ static const uint32_t _histogram_size = 256; /**< Default histogram size of 256. */
+};
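+
+/* A minimal usage sketch (editorial addition), as used for histogram
+ * equalization: `img` is a U8 image, `hist` a 256-bin distribution already
+ * filled by a histogram kernel, `cdf` a 256-bin distribution and `lut` a
+ * 256-entry U8 lookup table (all assumed pre-allocated).
+ *
+ * @code
+ * NECumulativeDistributionKernel cdf_kernel;
+ * cdf_kernel.configure(&img, &hist, &cdf, &lut);
+ * cdf_kernel.run(cdf_kernel.window());
+ * @endcode
+ */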
+}
+
+#endif /*__ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
new file mode 100644
index 0000000000..7384cd1f02
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depth concatenate kernel.
+ * The input tensor will be concatenated into the output tensor.
+ */
+class NEDepthConcatenateKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDepthConcatenateKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthConcatenateKernel(const NEDepthConcatenateKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthConcatenateKernel &operator=(const NEDepthConcatenateKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDepthConcatenateKernel(NEDepthConcatenateKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDepthConcatenateKernel &operator=(NEDepthConcatenateKernel &&) = default;
+ /** Default destructor */
+ ~NEDepthConcatenateKernel() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Input tensor. Data types supported: F32.
+ * @param[in] depth_offset The offset on the Z axis.
+ * @param[in,out] output Output tensor. Data types supported: F32.
+ *
+ * @note: The two lowest dimensions of the output tensor can't be smaller than those of the input.
+ * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+ *
+ */
+ void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const ITensor *_input;
+ ITensor *_output;
+ int _top_bottom;
+ int _left_right;
+ unsigned int _depth_offset;
+};
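+
+/* A minimal usage sketch (editorial addition): writing an 8-channel F32 input
+ * into an output volume starting at depth offset 8, i.e. concatenating the
+ * second of two 8-channel tensors. `in` and `out` are assumed pre-allocated.
+ *
+ * @code
+ * NEDepthConcatenateKernel concat;
+ * concat.configure(&in, 8, &out); // copy `in` at Z offset 8 of `out`
+ * concat.run(concat.window());
+ * @endcode
+ */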
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
new file mode 100644
index 0000000000..0c5c29e4db
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Depth conversion kernel */
+class NEDepthConvertKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor*/
+ NEDepthConvertKernel();
+ /** Set the input and output of the kernel
+ *
+ * Valid conversions Input -> Output :
+ *
+ * - QS8 -> F32
+ * - U8 -> U16, S16, S32
+ * - U16 -> U8, U32
+ * - S16 -> U8, S32
+ * - F32 -> QS8
+ *
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8/QS8/U16/S16/F32.
+ * @param[out] output The output tensor. Data types supported: U8/QS8/U16/S16/U32/S32/F32.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ */
+ void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ ConvertPolicy _policy;
+ uint32_t _shift;
+};
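+
+/* A minimal usage sketch (editorial addition): an up-conversion from U8 to S16
+ * with a left shift of 4 bits. `u8_in` and `s16_out` are assumed
+ * pre-allocated tensors of the same shape.
+ *
+ * @code
+ * NEDepthConvertKernel convert;
+ * convert.configure(&u8_in, &s16_out, ConvertPolicy::SATURATE, 4);
+ * convert.run(convert.window());
+ * @endcode
+ */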
+}
+#endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
new file mode 100644
index 0000000000..abb8a894c0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+#define __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the derivative along the X/Y directions on a tensor. */
+class NEDerivativeKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDerivativeKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDerivativeKernel(const NEDerivativeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDerivativeKernel(NEDerivativeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
+ /** Initialise the kernel's sources, destination and border
+ *
+ * @note At least one of output_x or output_y must be set
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Function to perform derivative along the X direction on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void derivative_x(const Window &window);
+ /** Function to perform derivative along the Y direction on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void derivative_y(const Window &window);
+ /** Function to perform derivative along the X and Y direction on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void derivative_xy(const Window &window);
+ /** Common signature for all the specialised derivative functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
+ /** Derivative function to use for the particular tensor types passed to configure() */
+ DerivativeFunction _func;
+
+private:
+ const ITensor *_input; /**< Input tensor */
+ ITensor *_output_x; /**< Output tensor - Derivative along the X direction */
+ ITensor *_output_y; /**< Output tensor - Derivative along the Y direction */
+};
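+
+/* A minimal usage sketch (editorial addition): computing both X and Y
+ * derivatives in one pass. `src` (U8), `dx` and `dy` (S16) are assumed
+ * pre-allocated; border filling is elided.
+ *
+ * @code
+ * NEDerivativeKernel deriv;
+ * deriv.configure(&src, &dx, &dy, false); // pass nullptr to skip a direction
+ * deriv.run(deriv.window());
+ * @endcode
+ */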
+}
+#endif /* __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/arm_compute/core/NEON/kernels/NEDilateKernel.h
new file mode 100644
index 0000000000..05f148a1fd
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDilateKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDILATEKERNEL_H__
+#define __ARM_COMPUTE_NEDILATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image dilation */
+class NEDilateKernel : public INESimpleKernel
+{
+public:
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEDILATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
new file mode 100644
index 0000000000..f098e18655
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to accumulate the biases to each element of the input tensor
+ *
+ * @note We assume bias to be shared
+ */
+class NEDirectConvolutionLayerBiasAccumulateKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDirectConvolutionLayerBiasAccumulateKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDirectConvolutionLayerBiasAccumulateKernel(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDirectConvolutionLayerBiasAccumulateKernel &operator=(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDirectConvolutionLayerBiasAccumulateKernel(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDirectConvolutionLayerBiasAccumulateKernel &operator=(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default;
+ /** Default destructor */
+ ~NEDirectConvolutionLayerBiasAccumulateKernel() = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in,out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+ * Data type supported: QS8/F32
+ * @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
+ * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+ * Data type supported: Same as @p input
+ */
+ void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ using BiasAccumulateKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output);
+
+private:
+ BiasAccumulateKernel *_func;
+ ITensor *_input;
+ const ITensor *_bias;
+ ITensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
new file mode 100644
index 0000000000..d726071606
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON interface for Direct Convolution Layer kernel */
+class NEDirectConvolutionLayerKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEDirectConvolutionLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
+ /** Default destructor */
+ ~NEDirectConvolutionLayerKernel() = default;
+ /** Set the input, weights and output tensors.
+ *
+ * @param[in] input Input tensor. Data types supported: QS8/F32.
+     * @param[in]  weights   Set of kernels to convolve the input volume.
+     *                       Its 3rd dimension must match the 3rd dimension of the input volume.
+ * Data type supported: Same as @p input.
+ * @param[out] output Output tensor.
+     *                       Its 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const ITensor *_input;
+ const ITensor *_weights;
+ ITensor *_output;
+ PadStrideInfo _conv_info;
+ BorderSize _border_size;
+ unsigned int _kernel_size;
+ unsigned int _num_elems_read_per_iteration;
+ unsigned int _num_elems_written_per_iteration;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__ */
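
A minimal configuration sketch, assuming pre-initialised tensors with the shapes documented above (names and shapes are hypothetical):

    #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    // src: [W, H, IFM], weights: [kW, kH, IFM, OFM], dst: [outW, outH, OFM]
    void direct_conv(const ITensor &src, const ITensor &weights, ITensor &dst)
    {
        NEDirectConvolutionLayerKernel conv;
        conv.configure(&src, &weights, &dst, PadStrideInfo(1, 1, 0, 0)); // stride 1, no padding
        conv.run(conv.window());
    }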
diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/arm_compute/core/NEON/kernels/NEErodeKernel.h
new file mode 100644
index 0000000000..86dc217cc0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEErodeKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEERODEKERNEL_H__
+#define __ARM_COMPUTE_NEERODEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image erosion */
+class NEErodeKernel : public INESimpleKernel
+{
+public:
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEERODEKERNEL_H__ */
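
A self-contained sketch of the configure-then-allocate pattern these simple kernels follow (border filling, e.g. with NEFillBorderKernel, is omitted for brevity):

    #include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));

        NEErodeKernel erode;
        erode.configure(&src, &dst, false); // border is replicate/constant, not undefined
        // configure() runs before allocate() so any padding requirement is known
        src.allocator()->allocate();
        dst.allocator()->allocate();

        erode.run(erode.window());
        return 0;
    }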
diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
new file mode 100644
index 0000000000..d9bd6acde9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+#define __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** NEON kernel to perform fast corners */
+class NEFastCornersKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEFastCornersKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFastCornersKernel(const NEFastCornersKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEFastCornersKernel(NEFastCornersKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
+ /** Initialise the kernel.
+ *
+ * @param[in] input Source image. Data type supported: U8.
+ * @param[out] output Output image. Data type supported: U8.
+ * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in] non_max_suppression True if non-maxima suppression is applied, false otherwise.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const IImage *_input; /**< source image */
+    IImage       *_output;              /**< intermediate results */
+    uint8_t       _threshold;           /**< threshold on difference between intensity of the central pixel and pixels on the Bresenham circle */
+    bool          _non_max_suppression; /**< true if non-maxima suppression is applied in the next stage */
+};
+}
+#endif /*__ARM_COMPUTE_NEFASTCORNERSKERNEL_H__ */
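
A short sketch, assuming src and response are pre-initialised U8 images of the same size:

    #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"

    using namespace arm_compute;

    void fast_corners(const IImage &src, IImage &response)
    {
        NEFastCornersKernel fast;
        fast.configure(&src, &response, 20 /* threshold */, true /* non-max suppression */, false);
        fast.run(fast.window());
    }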
diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
new file mode 100644
index 0000000000..8e0846ea88
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+#define __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** This kernel adds all texels greater than or equal to the threshold value to the keypoint array. */
+class NEFillArrayKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+ NEFillArrayKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFillArrayKernel(const NEFillArrayKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFillArrayKernel &operator=(const NEFillArrayKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEFillArrayKernel(NEFillArrayKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEFillArrayKernel &operator=(NEFillArrayKernel &&) = default;
+    /** Default destructor */
+ ~NEFillArrayKernel() = default;
+
+ /** Initialise the kernel.
+ *
+ * @param[in] input Source image. Data type supported: U8.
+     * @param[in]  threshold Texels greater than or equal to the threshold will be added to the array.
+     * @param[out] output    Array of keypoints to store the results.
+ */
+ void configure(const IImage *input, uint8_t threshold, IKeyPointArray *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ const IImage *_input;
+ IKeyPointArray *_output;
+ uint8_t _threshold;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLARRAYKERNEL_H__ */
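
A sketch pairing this kernel with a response image such as the FAST output above (the capacity of 10000 keypoints is an arbitrary assumption):

    #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
    #include "arm_compute/runtime/Array.h"

    using namespace arm_compute;

    void collect_keypoints(const IImage &response)
    {
        KeyPointArray keypoints(10000); // maximum number of keypoints to store
        NEFillArrayKernel fill;
        fill.configure(&response, 1 /* threshold */, &keypoints);
        fill.run(fill.window());
        // keypoints.num_values() now reports how many texels passed the threshold
    }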
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
new file mode 100644
index 0000000000..3ec66115e2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLBORDERKERNEL_H__
+#define __ARM_COMPUTE_NEFILLBORDERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to fill borders */
+class NEFillBorderKernel : public INEKernel
+{
+public:
+ /** Default Constructor */
+ NEFillBorderKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFillBorderKernel(const NEFillBorderKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFillBorderKernel &operator=(const NEFillBorderKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEFillBorderKernel(NEFillBorderKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEFillBorderKernel &operator=(NEFillBorderKernel &&) = default;
+ /** Default destructor */
+ ~NEFillBorderKernel() = default;
+
+ /** Initialise the function.
+ *
+ * @note This kernel fills the borders within the XY-planes.
+ *
+ * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QS8/QS16/S16/S32/F32.
+ * @param[in] border_size Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ template <typename T>
+ void fill_replicate_single_channel(const Window &window);
+ template <typename T>
+ void fill_constant_value_single_channel(const Window &window);
+
+ ITensor *_tensor;
+ BorderSize _border_size;
+ BorderMode _mode;
+ PixelValue _constant_border_value;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLBORDERKERNEL_H__ */
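
A minimal sketch filling a one-element border with zeros (the default-constructed PixelValue):

    #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"

    using namespace arm_compute;

    void zero_border(ITensor &tensor)
    {
        NEFillBorderKernel fill;
        fill.configure(&tensor, BorderSize(1), BorderMode::CONSTANT, PixelValue());
        fill.run(fill.window());
    }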
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
new file mode 100644
index 0000000000..61e6e46463
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__
+#define __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to fill the interior borders */
+class NEFillInnerBorderKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEFillInnerBorderKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default;
+ /** Default destructor */
+ ~NEFillInnerBorderKernel() = default;
+
+ /** Initialise the function.
+ *
+ * @note This kernel fills the borders within the XY-planes.
+ *
+ * @param[in,out] input Tensor to process. Data types supported: U8/QS8/S16/S32/F32.
+ * @param[in] border_size Size of the border to fill in elements.
+     * @param[in]     constant_border_value (Optional) Constant value to use for the borders. Defaults to zero.
+ *
+ */
+ void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue());
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ template <typename T>
+ void fill_value_single_channel(const Window &window);
+
+ ITensor *_tensor;
+ BorderSize _border_size;
+ PixelValue _constant_border_value;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
new file mode 100644
index 0000000000..b9884ffb57
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__
+#define __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to interleave the elements of a matrix
+ *
+ * This kernel puts the values of each 4x4 block of Matrix A on the same row (interleaved values)
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ */
+class NEGEMMInterleave4x4Kernel : public INESimpleKernel
+{
+public:
+    /** Constructor */
+ NEGEMMInterleave4x4Kernel();
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the transpose functions
+ *
+ * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output The output tensor. Data type supported: same as @p input
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
+
+ GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__*/
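
A sketch wiring up the interleave kernel with the documented output shape (the 64x64 F32 input is an arbitrary assumption):

    #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor a, a_interleaved;
        a.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
        // Documented output shape [ height * 4, ceil(width / 4.0f) ]: [256, 16] for a 64x64 input
        a_interleaved.allocator()->init(TensorInfo(TensorShape(256U, 16U), 1, DataType::F32));

        NEGEMMInterleave4x4Kernel interleave;
        interleave.configure(&a, &a_interleaved);
        a.allocator()->allocate();
        a_interleaved.allocator()->allocate();
        interleave.run(interleave.window());
        return 0;
    }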
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..ba4dcc3373
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply matrices
+ *
+ * @note @ref NEGEMMLowpMatrixMultiplyKernel is the low precision matrix product kernel.
+ * This kernel performs the following computation:
+ *
+ *  -# Convert the values of matrix A from uint8 to int32 and add a_offset to each of them.
+ *  -# Convert the values of matrix B from uint8 to int32 and add b_offset to each of them.
+ *  -# Compute the int32 matrix product of the resulting a * b.
+ *  -# Add output_offset to each entry of the result.
+ *  -# Multiply each entry of the result by output_mult_int and shift right by shift bits, rounding to the nearest integer.
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to uint8.
+ *
+ */
+class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMLowpMatrixMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
+ * kernels change the layout of the original matrices to be more cache-friendly.
+ *
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8
+ * @param[in] input1 Input tensor containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] output_offset Offset to be added to each element of the output matrix
+     * @param[in]  output_mult_int Value by which to multiply each entry of the result.
+     * @param[in]  shift           Number of bits by which to shift the result to the right.
+ */
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+ int32_t _a_offset;
+ int32_t _b_offset;
+ int32_t _output_offset;
+ int32_t _output_mult_int;
+ int32_t _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
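
The documented steps amount to the following scalar reference for a single output element; this is a sketch for clarity only, the kernel itself vectorises the computation over the interleaved and transposed layouts:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    uint8_t gemmlowp_ref(const uint8_t *a_row, const uint8_t *b_col, size_t k,
                         int32_t a_offset, int32_t b_offset,
                         int32_t output_offset, int32_t output_mult_int, int32_t shift)
    {
        int32_t acc = 0;
        for(size_t i = 0; i < k; ++i)
        {
            // Widen to int32 and apply the per-matrix offsets before multiplying
            acc += (static_cast<int32_t>(a_row[i]) + a_offset) * (static_cast<int32_t>(b_col[i]) + b_offset);
        }
        acc = (acc + output_offset) * output_mult_int; // offset, then scale
        if(shift > 0)
        {
            acc = (acc + (1 << (shift - 1))) >> shift; // round to nearest
        }
        return static_cast<uint8_t>(std::min(255, std::max(0, acc))); // clamp to [0..255]
    }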
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 0000000000..c0ecafcd39
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to add a bias to each row of the input tensor */
+class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Default destructor */
+ ~NEGEMMMatrixAccumulateBiasesKernel() = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+     * @param[in, out] accum  The accumulate tensor to add the biases to. Data type supported: QS8/F32
+     * @param[in]      biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
+ */
+ void configure(ITensor *accum, const ITensor *biases);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ ITensor *_accum;
+ const ITensor *_biases;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
new file mode 100644
index 0000000000..1ab52fa2f2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
+ *
+ * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. When this kernel is used to finalize the GEMM result, we have:
+ * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel
+ * - MTX_1 = C
+ */
+class NEGEMMMatrixAdditionKernel : public INESimpleKernel
+{
+public:
+ /** Constructor */
+ NEGEMMMatrixAdditionKernel();
+ /** Prevent instances of this class from being copied */
+ NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete;
+ /** Prevent instances of this class from being copied */
+ NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @note The input and output tensor must have the same dimensions
+ *
+ * @param[in] input Input tensor (Matrix C). Data types supported: QS8/F16/F32
+ * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
+ * @param[in] beta Weight of matrix C
+ */
+ void configure(const ITensor *input, ITensor *output, float beta);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the matrix addition functions
+ *
+ * @param[in] input An input tensor. Data types supported: QS8/F16/F32
+ * @param[out] output The output tensor. Data type supported: same as @p input
+ * @param[in] window Region on which to execute the kernel.
+ * @param[in] beta Weight of matrix C
+ */
+ using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
+ /** Matrix addition function to use for the particular tensor types passed to configure() */
+ MatrixAdditionFunction *_func;
+ float _beta;
+};
+}
+#endif /* __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ */
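
A sketch of the finalization step described above (tensor names are hypothetical; the beta check mirrors the note that this stage only runs for beta != 0.0):

    #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"

    using namespace arm_compute;

    // out already holds alpha * A * B; accumulate beta * C in place.
    void finalize_gemm(const ITensor &c, ITensor &out, float beta)
    {
        if(beta != 0.f)
        {
            NEGEMMMatrixAdditionKernel madd;
            madd.configure(&c, &out, beta);
            madd.run(madd.window());
        }
    }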
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..a684945828
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
+ *
+ * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+ * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
+ *
+ */
+class NEGEMMMatrixMultiplyKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMMatrixMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
+ * These two kernels change the layout of the original matrices to be more cache-friendly.
+ *
+ * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
+ * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
+ * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
+ * @param[in] alpha Weight of the matrix product
+ */
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+ float _alpha;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__*/
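
A sketch of the matrix path described above, chaining the two reshape kernels with the multiplication (all tensors are assumed pre-initialised with compatible shapes):

    #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
    #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
    #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"

    using namespace arm_compute;

    void gemm_matrix_path(const ITensor &a, const ITensor &b,
                          ITensor &a_int, ITensor &b_t, ITensor &out, float alpha)
    {
        NEGEMMInterleave4x4Kernel  interleave;
        NEGEMMTranspose1xWKernel   transpose;
        NEGEMMMatrixMultiplyKernel mm;

        interleave.configure(&a, &a_int); // reshape A into interleaved blocks
        transpose.configure(&b, &b_t);    // reshape B into 1xW chunks
        mm.configure(&a_int, &b_t, &out, alpha);

        interleave.run(interleave.window());
        transpose.run(transpose.window());
        mm.run(mm.window());
    }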
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
new file mode 100644
index 0000000000..5d8a3697cb
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
+ *
+ * The following example shows how the 1xW transposition works when the input data type is F32
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ *
+ * The following example shows how the 1xW transposition works when the input data type is F16
+ *
+ * @f[
+ * \left( \begin{array}{cccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
+ * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
+ * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
+ * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
+ * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
+ * \end{array} \right)
+ * @f]
+ *
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ *
+ */
+class NEGEMMTranspose1xWKernel : public INESimpleKernel
+{
+public:
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input.
+ */
+ void configure(const ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
new file mode 100644
index 0000000000..763fab88f6
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 3x3 filter */
+class NEGaussian3x3Kernel : public INESimpleKernel
+{
+public:
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: S16
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
new file mode 100644
index 0000000000..86b28907da
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Gaussian 5x5 filter (horizontal pass) */
+class NEGaussian5x5HorKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEGaussian5x5HorKernel();
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ BorderSize _border_size;
+};
+
+/** NEON kernel to perform a Gaussian 5x5 filter (vertical pass) */
+class NEGaussian5x5VertKernel : public INESimpleKernel
+{
+public:
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: S16.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__ */
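
A sketch chaining the two passes (the intermediate S16 tensor's border must be filled, e.g. with NEFillBorderKernel, between the passes; that step is omitted here):

    #include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"

    using namespace arm_compute;

    // U8 src -> S16 tmp (horizontal pass) -> U8 dst (vertical pass)
    void gaussian5x5(const ITensor &src, ITensor &tmp, ITensor &dst)
    {
        NEGaussian5x5HorKernel  hor;
        NEGaussian5x5VertKernel vert;
        hor.configure(&src, &tmp, false);
        vert.configure(&tmp, &dst, false);
        hor.run(hor.window());
        vert.run(vert.window());
    }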
diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
new file mode 100644
index 0000000000..40a6aa7375
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__
+#define __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a GaussianPyramid (horizontal pass) */
+class NEGaussianPyramidHorKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEGaussianPyramidHorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel(const NEGaussianPyramidHorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidHorKernel &operator=(const NEGaussianPyramidHorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default;
+ /** Default destructor */
+ ~NEGaussianPyramidHorKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ BorderSize _border_size;
+ int _l2_load_offset;
+};
+
+/** NEON kernel to perform a GaussianPyramid (vertical pass) */
+class NEGaussianPyramidVertKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NEGaussianPyramidVertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel(const NEGaussianPyramidVertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGaussianPyramidVertKernel &operator=(const NEGaussianPyramidVertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default;
+ /** Default destructor */
+ ~NEGaussianPyramidVertKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: S16.
+ * @param[out] output Destination tensor. Data type supported: U8.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ int _t2_load_offset;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
new file mode 100644
index 0000000000..dd85778b8a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__
+#define __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__
+
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG Orientation Binning */
+class NEHOGOrientationBinningKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEHOGOrientationBinningKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default;
+ /** Default destructor */
+ ~NEHOGOrientationBinningKernel() = default;
+
+ /** Initialise the kernel's inputs, output and HOG's metadata
+ *
+ * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
+ * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
+ * @param[out] output Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] hog_info HOG's metadata
+ */
+ void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised orientation binning functions
+ *
+ * @param[in] mag_row_ptr Pointer to the first row of the cell in the magnitude tensor
+ * @param[in] phase_row_ptr Pointer to the first row of the cell in the phase tensor
+     * @param[out] output_ptr    Pointer to the output cell of the HOG space tensor
+ * @param[in] mag_stride Stride of the magnitude tensor
+ * @param[in] phase_stride Stride of the phase tensor
+ * @param[in] cell_width Width of the cell
+ * @param[in] cell_height Height of the cell
+ * @param[in] num_bins Number of bins for each cell
+ * @param[in] phase_scale Scale factor to apply to the phase in order to calculate the histogram index
+ */
+ using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
+ size_t cell_height, size_t num_bins, float phase_scale);
+ /** Orientation binning function to use for the particular cell width passed to configure() */
+ OrientBinFunc *_func;
+ const ITensor *_input_magnitude;
+ const ITensor *_input_phase;
+ ITensor *_output;
+ size_t _cell_width;
+ size_t _cell_height;
+ size_t _num_bins;
+ float _phase_scale;
+};
+
+/** NEON kernel to perform HOG block normalization */
+class NEHOGBlockNormalizationKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEHOGBlockNormalizationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default;
+ /** Default destructor */
+ ~NEHOGBlockNormalizationKernel() = default;
+
+ /** Initialise the kernel's input, output and HOG's metadata
+ *
+ * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] hog_info HOG's metadata
+ */
+ void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the specialised block normalization functions
+ *
+     * @param[in]  input_row_ptr              Pointer to the first row of the block in the input HOG space tensor
+     * @param[out] output_ptr                 Pointer to the output block of the normalised HOG space
+     * @param[in]  input_stride               Stride of the input HOG space tensor
+     * @param[in]  num_cells_per_block_height Number of cells per block along the Y direction
+     * @param[in]  num_bins_block_x           Number of bins per block along the X direction
+     * @param[in]  num_bins_block             Number of total bins per block
+     * @param[in]  l2_hyst_threshold          Threshold to use for L2 hysteresis normalization
+ */
+ using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+ float l2_hyst_threshold);
+ /** Block normalization function to use for the particular normalization type passed to configure() */
+ BlockNormFunc *_func;
+ const ITensor *_input;
+ ITensor *_output;
+ Size2D _num_cells_per_block;
+ Size2D _num_cells_per_block_stride;
+ size_t _num_bins;
+ float _l2_hyst_threshold;
+};
+}
+#endif /* __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
new file mode 100644
index 0000000000..e56d1e5fd8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__
+#define __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform HOG detection using a linear SVM */
+class NEHOGDetectorKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEHOGDetectorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEHOGDetectorKernel(NEHOGDetectorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = default;
+ /** Default destructor */
+ ~NEHOGDetectorKernel() = default;
+
+ /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
+ *
+ * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] hog HOG data object used by @ref NEHOGOrientationBinningKernel and @ref NEHOGBlockNormalizationKernel
+ * @param[out] detection_windows Array of @ref DetectionWindow. This array stores all the detected objects
+ * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+ * It must be a multiple of hog->info()->block_stride()
+ * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
+ * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
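+ *
+ * A hedged usage sketch (descriptor, hog and windows are assumed to be created
+ * and allocated by the caller; the stride must be a multiple of the block stride):
+ * @code
+ * NEHOGDetectorKernel detector;
+ * detector.configure(&descriptor, &hog, &windows, Size2D(8, 8));
+ * detector.run(detector.window());
+ * @endcode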
+ */
+ void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input;
+ IDetectionWindowArray *_detection_windows;
+ const float *_hog_descriptor;
+ float _bias;
+ float _threshold;
+ uint16_t _idx_class;
+ size_t _num_bins_per_descriptor_x;
+ size_t _num_blocks_per_descriptor_y;
+ size_t _block_stride_width;
+ size_t _block_stride_height;
+ size_t _detection_window_width;
+ size_t _detection_window_height;
+ size_t _max_num_detection_windows;
+ std::mutex _mutex;
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
new file mode 100644
index 0000000000..0abd73ef97
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__
+#define __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__
+
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Common interface for all Harris Score kernels */
+class INEHarrisScoreKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ INEHarrisScoreKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default;
+ /** Default destructor */
+ ~INEHarrisScoreKernel() = default;
+
+public:
+ /** Set up the kernel parameters
+ *
+ * @param[in] input1 Source image (gradient X). Data types supported: S16/S32
+ * @param[in] input2 Source image (gradient Y). Data types supported: same as @p input1
+ * @param[out] output Destination image (harris score). Data types supported: F32
+ * @param[in] norm_factor Normalization factor to use according to the gradient size (must be different from 0)
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0;
+
+protected:
+ const IImage *_input1; /**< Source image - Gx component */
+ const IImage *_input2; /**< Source image - Gy component */
+ IImage *_output; /**< Destination image - Harris score */
+ float _sensitivity; /**< Sensitivity value */
+ float _strength_thresh; /**< Threshold value */
+ float _norm_factor; /**< Normalization factor */
+ BorderSize _border_size; /**< Border size */
+};
+
+/** Template NEON kernel to perform Harris Score.
+ * The implementation supports 3, 5, and 7 for the block_size
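+ *
+ * A hedged instantiation sketch (gx, gy and score are assumed to be
+ * pre-allocated images of the documented types; the float parameters are illustrative):
+ * @code
+ * NEHarrisScoreKernel<3> harris; // 3x3 block size
+ * harris.configure(&gx, &gy, &score, 1.f, 0.0001f, 0.04f, false);
+ * @endcode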
+ */
+template <int32_t block_size>
+class NEHarrisScoreKernel : public INEHarrisScoreKernel
+{
+public:
+ /** Default constructor */
+ NEHarrisScoreKernel();
+ // Inherited methods overridden:
+ void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
+ BorderSize border_size() const override;
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the specialised harris score functions */
+ using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float norm_factor, float sensitivity, float strength_thresh);
+ /** Harris Score function to use for the particular image types passed to configure() */
+ HarrisScoreFunction *_func;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Template NEON kernel to perform Harris Score using F16 */
+template <int32_t block_size>
+class NEHarrisScoreFP16Kernel : public INEHarrisScoreKernel
+{
+public:
+ /** Default constructor */
+ NEHarrisScoreFP16Kernel();
+ // Inherited methods overridden:
+ void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
+ BorderSize border_size() const override;
+ void run(const Window &window) override;
+
+private:
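+ /** Common signature for all the specialised harris score functions */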
+ using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float norm_factor, float sensitivity, float strength_thresh);
+ /** Harris Score function to use for the particular image types passed to configure() */
+ HarrisScoreFunction *_func;
+};
+#else
+template <int32_t block_size>
+using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>;
+#endif
+}
+#endif /* __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
new file mode 100644
index 0000000000..c4dbbeae83
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+#define __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the histogram kernel */
+class NEHistogramKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEHistogramKernel();
+ /** Default destructor */
+ ~NEHistogramKernel() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHistogramKernel(const NEHistogramKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEHistogramKernel(NEHistogramKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEHistogramKernel &operator=(NEHistogramKernel &&) = default;
+
+ /** Set the input image and the distribution output.
+ *
+ * @param[in] input Source image. Data type supported: U8.
+ * @param[out] output Destination distribution.
+ * @param[in,out] local_hist Array that the threads use to save their local histograms.
+ * Its size should be equal to (number_of_threads * num_bins),
+ * and the Window::thread_id() is used to determine the part of the array
+ * used by each thread.
+ * @param[out] window_lut LUT with pre-calculated possible window values.
+ * The size of the LUT should be equal to max_range_size and it will be filled
+ * during the configure stage, while it is re-used in every run and can
+ * therefore be safely shared among threads.
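+ *
+ * A hedged sizing sketch (num_threads, num_bins, image and distribution are
+ * assumed to be provided by the caller; 256 is the max_range_size for U8):
+ * @code
+ * std::vector<uint32_t> local_hist(num_threads * num_bins);
+ * std::vector<uint32_t> window_lut(256);
+ * NEHistogramKernel hist;
+ * hist.configure(&image, &distribution, local_hist.data(), window_lut.data());
+ * @endcode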
+ */
+ void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
+ /** Set the input image and the distribution output.
+ *
+ * @note Used for a histogram of fixed size equal to 256
+ *
+ * @param[in] input Source image. Data type supported: U8.
+ * @param[out] output Destination distribution, which must have 256 bins.
+ */
+ void configure(const IImage *input, IDistribution1D *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Function to merge multiple partial histograms.
+ *
+ * @param[out] global_hist Pointer to the final histogram.
+ * @param[in] local_hist Pointer to the partial histograms.
+ * @param[in] bins Number of bins.
+ */
+ void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
+ /** Function to merge multiple minimum values of partial histograms.
+ *
+ * @param[out] global_min Pointer to the global min value.
+ * @param[in] local_min Local min value.
+ */
+ void merge_min(uint8_t *global_min, const uint8_t &local_min);
+ /** Function to perform histogram on the given window
+ *
+ * @param[in] win Region on which to execute the kernel
+ */
+ void histogram_U8(Window win);
+ /** Function to perform a histogram on the given window, where the histogram
+ * is of fixed size 256, without ranges and offsets.
+ *
+ * @param[in] win Region on which to execute the kernel
+ */
+ void histogram_fixed_U8(Window win);
+ /** Pre-calculate the pixel windowing for every possible pixel
+ *
+ * Calculate (V - offset) * numBins / range where V is every possible pixel value.
+ *
+ * @note We currently support only U8 images, thus possible pixel values are between 0 and 255
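+ *
+ * A hedged sketch of the fill loop (offset, range and num_bins are assumed to
+ * come from the output distribution; clamping of out-of-range values is omitted):
+ * @code
+ * for(unsigned int v = 0; v < 256; ++v)
+ * {
+ *     window_lut[v] = ((v - offset) * num_bins) / range;
+ * }
+ * @endcode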
+ */
+ void calculate_window_lut() const;
+ /** Common signature for all the specialised Histogram functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);
+
+ HistogramFunctionPtr _func; ///< Histogram function to use for the particular image types passed to configure()
+ const IImage *_input;
+ IDistribution1D *_output;
+ uint32_t *_local_hist;
+ uint32_t *_window_lut;
+ std::mutex _hist_mtx;
+ static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
+};
+}
+#endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
new file mode 100644
index 0000000000..ebaafb467f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEIM2COLKERNEL_H__
+#define __ARM_COMPUTE_NEIM2COLKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns: each convolution block is stripped out into a single column,
+ * which transforms a convolution into a plain matrix multiplication.
+ *
+ * For example, considering the image below and assuming 3x3 image blocks with a stride of 1, we have:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEIm2ColKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEIm2ColKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEIm2ColKernel(const NEIm2ColKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEIm2ColKernel(NEIm2ColKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
+ /** Default destructor */
+ ~NEIm2ColKernel() = default;
+
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] convolved_dims The convolved output dimensions.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] has_bias In case biases are provided, expands the matrix with an additional 1 per column to account for the bias.
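+ *
+ * A hedged configuration sketch (the input/output tensors and the convolved
+ * output size out_w x out_h are assumed to be computed by the caller):
+ * @code
+ * NEIm2ColKernel im2col;
+ * im2col.configure(&input, &output, std::make_pair(out_w, out_h), PadStrideInfo(1, 1, 0, 0), false);
+ * @endcode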
+ */
+ void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Template function to run the im2col optimised for the fully connected layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T>
+ void run_reduced(const Window &window);
+ /** Template function to run the im2col used for the convolution layer case
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename T, bool has_pads>
+ void run_generic(const Window &window);
+ /** Common signature for all the specialised im2col functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
+
+ Im2ColFunctionPtr _func;
+ const ITensor *_input;
+ ITensor *_output;
+ std::pair<unsigned int, unsigned int> _convolved_dims;
+ PadStrideInfo _conv_info;
+ unsigned int _kernel_size;
+ bool _has_bias;
+};
+}
+#endif /*__ARM_COMPUTE_NEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
new file mode 100644
index 0000000000..13647889ab
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+#define __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to compute the integral image of an image */
+class NEIntegralImageKernel : public INESimpleKernel
+{
+public:
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: U32
+ */
+ void configure(const ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+ bool is_parallelisable() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
new file mode 100644
index 0000000000..9ab7f91092
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELKTRACKERKERNEL_H__
+#define __ARM_COMPUTE_NELKTRACKERKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Internal keypoint class for Lucas-Kanade Optical Flow */
+struct NELKInternalKeypoint
+{
+ float x{ 0.f }; /**< x coordinate of the keypoint */
+ float y{ 0.f }; /**< y coordinate of the keypoint */
+ bool tracking_status{ false }; /**< the tracking status of the keypoint */
+};
+
+using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
+
+/** Interface for the Lucas-Kanade tracker kernel */
+class NELKTrackerKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NELKTrackerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELKTrackerKernel(const NELKTrackerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELKTrackerKernel &operator=(const NELKTrackerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NELKTrackerKernel(NELKTrackerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NELKTrackerKernel &operator=(NELKTrackerKernel &&) = default;
+ /** Default destructor */
+ ~NELKTrackerKernel() = default;
+
+ /** Initialise the kernel input and output
+ *
+ * @param[in] input_old Pointer to the input old tensor. Data type supported: U8
+ * @param[in] input_new Pointer to the input new tensor. Data type supported: U8
+ * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data type supported: S16
+ * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. Data type supported: S16
+ * @param[in] old_points Pointer to the IKeyPointArray storing old key points
+ * @param[in] new_points_estimates Pointer to the IKeyPointArray storing the new estimated key points
+ * @param[out] new_points Pointer to the IKeyPointArray storing new key points
+ * @param[in, out] old_points_internal Pointer to the array of NELKInternalKeypoint for old points
+ * @param[out] new_points_internal Pointer to the array of NELKInternalKeypoint for new points
+ * @param[in] termination The criteria to terminate the search of each keypoint.
+ * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
+ * @param[in] epsilon The error for terminating the algorithm
+ * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] level The pyramid level
+ * @param[in] num_levels The number of pyramid levels
+ * @param[in] pyramid_scale Scale factor used for generating the pyramid
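+ *
+ * A hedged sketch for one pyramid level (tensors, keypoint arrays and the
+ * numeric parameters are illustrative and assumed to be set up by the caller):
+ * @code
+ * NELKTrackerKernel lk;
+ * lk.configure(&old_img, &new_img, &scharr_gx, &scharr_gy,
+ *              &old_points, &estimates, &new_points, &old_internal, &new_internal,
+ *              Termination::TERM_CRITERIA_BOTH, true, 0.01f, 20, 21, level, num_levels, 0.5f);
+ * @endcode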
+ */
+ void configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
+ const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
+ INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
+ Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
+ size_t level, size_t num_levels, float pyramid_scale);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Initialise the array of keypoints in the provided range
+ *
+ * @param[in] start Index of first element in the keypoints array to be initialised
+ * @param[in] end Index after the last element in the keypoints array to be initialised
+ */
+ void init_keypoints(int start, int end);
+ /** Compute the structure tensor A^T * A based on the scharr gradients I_x and I_y
+ *
+ * @param[in] keypoint Keypoint for which gradients are computed
+ * @param[out] bilinear_ix Intermediate interpolated data for X gradient
+ * @param[out] bilinear_iy Intermediate interpolated data for Y gradient
+ *
+ * @return Values A11, A12, A22
+ */
+ std::tuple<int, int, int> compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy);
+ /** Compute the vector A^T * b, i.e. -sum(I_d * I_t) for d in {x,y}
+ *
+ * @param[in] old_keypoint Old keypoint for which gradient is computed
+ * @param[in] new_keypoint New keypoint for which gradient is computed
+ * @param[in] bilinear_ix Intermediate interpolated data for X gradient
+ * @param[in] bilinear_iy Intermediate interpolated data for Y gradient
+ *
+ * @return Values b1, b2
+ */
+ std::pair<int, int> compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy);
+
+ const ITensor *_input_old;
+ const ITensor *_input_new;
+ const ITensor *_old_scharr_gx;
+ const ITensor *_old_scharr_gy;
+ IKeyPointArray *_new_points;
+ const IKeyPointArray *_new_points_estimates;
+ const IKeyPointArray *_old_points;
+ INELKInternalKeypointArray *_old_points_internal;
+ INELKInternalKeypointArray *_new_points_internal;
+ Termination _termination;
+ bool _use_initial_estimate;
+ float _pyramid_scale;
+ float _epsilon;
+ unsigned int _num_iterations;
+ int _window_dimension;
+ unsigned int _level;
+ unsigned int _num_levels;
+ ValidRegion _valid_region;
+};
+}
+#endif /*__ARM_COMPUTE_NELKTRACKERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..d4bff661f9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply each row of the first tensor with the lowest 2 dimensions of the second tensor. */
+class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NELocallyConnectedMatrixMultiplyKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+ /** Initialise the kernel's input and output
+ *
+ * @param[in] input0 First input tensor. Data types supported: F32
+ * @param[in] input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
+ * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
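+ *
+ * A hedged sketch (the three F32 tensors are assumed to be shaped by the
+ * calling locally connected layer):
+ * @code
+ * NELocallyConnectedMatrixMultiplyKernel lc_mm;
+ * lc_mm.configure(&input0, &input1, &output);
+ * @endcode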
+ */
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const ITensor *_input0;
+ const ITensor *_input1;
+ ITensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
new file mode 100644
index 0000000000..5d49901dd0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+#define __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMagnitudePhaseKernel();
+ /** Destructor */
+ ~NEMagnitudePhaseKernel() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
+ /** Default move constructor */
+ NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
+ /** Default move assignment operator */
+ NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
+
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @note At least one of magnitude or phase must be set
+ *
+ * @param[in] gx Gradient X tensor. Data type supported: S16.
+ * @param[in] gy Gradient Y tensor. Data type supported: S16.
+ * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+ * @param[out] phase (Optional) The output tensor - Phase. Data type supported: U8.
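+ *
+ * A hedged sketch computing the magnitude only (per the note above, the unused
+ * phase output may be left as nullptr; gx, gy and mag are assumed pre-allocated):
+ * @code
+ * NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED> mag_phase;
+ * mag_phase.configure(&gx, &gy, &mag, nullptr);
+ * @endcode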
+ */
+ void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Function to perform magnitude on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude(const Window &window);
+ /** Function to perform phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void phase(const Window &window);
+ /** Function to perform magnitude and phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude_phase(const Window &window);
+
+private:
+ /** Common signature for all the specialised MagnitudePhase functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
+ /** MagnitudePhase function to use for the particular formats passed to configure() */
+ MagnitudePhaseFunctionPtr _func;
+ const ITensor *_gx; /**< Input gradient X */
+ const ITensor *_gy; /**< Input gradient Y */
+ ITensor *_magnitude; /**< Output - Magnitude */
+ ITensor *_phase; /**< Output - Phase */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Template interface for the kernel to compute magnitude and phase using F16 */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseFP16Kernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMagnitudePhaseFP16Kernel();
+ /** Destructor */
+ ~NEMagnitudePhaseFP16Kernel() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseFP16Kernel(const NEMagnitudePhaseFP16Kernel &) = delete;
+ /** Default move constructor */
+ NEMagnitudePhaseFP16Kernel(NEMagnitudePhaseFP16Kernel &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMagnitudePhaseFP16Kernel &operator=(const NEMagnitudePhaseFP16Kernel &) = delete;
+ /** Default move assignment operator */
+ NEMagnitudePhaseFP16Kernel &operator=(NEMagnitudePhaseFP16Kernel &&) = default;
+
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @note At least one of magnitude or phase must be set
+ *
+ * @param[in] gx Gradient X tensor. Data type supported: S16.
+ * @param[in] gy Gradient Y tensor. Data type supported: S16.
+ * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+ * @param[out] phase (Optional) The output tensor - Phase. Data type supported: U8.
+ */
+ void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Function to perform magnitude on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude(const Window &window);
+ /** Function to perform phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void phase(const Window &window);
+ /** Function to perform magnitude and phase on the given window
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ void magnitude_phase(const Window &window);
+
+ /** Common signature for all the specialised MagnitudePhase functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseFP16Kernel::*)(const Window &window);
+ /** MagnitudePhase function to use for the particular formats passed to configure() */
+ MagnitudePhaseFunctionPtr _func;
+ const ITensor *_gx; /**< Input gradient X */
+ const ITensor *_gy; /**< Input gradient Y */
+ ITensor *_magnitude; /**< Output - Magnitude */
+ ITensor *_phase; /**< Output - Phase */
+};
+#else
+template <MagnitudeType mag_type, PhaseType phase_type>
+using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>;
+#endif
+}
+#endif /* __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
new file mode 100644
index 0000000000..83407ccb7d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+#define __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class NEMeanStdDevKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMeanStdDevKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEMeanStdDevKernel(NEMeanStdDevKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = default;
+ /** Default destructor */
+ ~NEMeanStdDevKernel() = default;
+
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Input image. Data type supported: U8.
+ * @param[out] mean Output average pixel value.
+ * @param[out] global_sum Keeps the global sum of pixel values.
+ * @param[out] stddev (Optional) Output standard deviation of pixel values.
+ * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps the global sum of squared pixel values.
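+ *
+ * A hedged usage sketch (image is an assumed U8 input; the accumulators must
+ * outlive the kernel run):
+ * @code
+ * float    mean = 0.f, stddev = 0.f;
+ * uint64_t global_sum = 0, global_sum_squared = 0;
+ * NEMeanStdDevKernel msd;
+ * msd.configure(&image, &mean, &global_sum, &stddev, &global_sum_squared);
+ * @endcode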
+ */
+ void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ const IImage *_input;
+ float *_mean;
+ float *_stddev;
+ uint64_t *_global_sum;
+ uint64_t *_global_sum_squared;
+ std::mutex _mtx;
+};
+}
+#endif /* __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
new file mode 100644
index 0000000000..dee1aadfb9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__
+#define __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform a median filter on a tensor */
+class NEMedian3x3Kernel : public INESimpleKernel
+{
+public:
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
new file mode 100644
index 0000000000..e405ea5ae4
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+#define __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to perform a min/max search on an image. */
+class NEMinMaxKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMinMaxKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxKernel(const NEMinMaxKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEMinMaxKernel(NEMinMaxKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEMinMaxKernel &operator=(NEMinMaxKernel &&) = default;
+ /** Default destructor */
+ ~NEMinMaxKernel() = default;
+
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Input Image. Data types supported: U8/S16.
+ * @param[out] min Minimum value of image.
+ * @param[out] max Maximum value of image.
+ */
+ void configure(const IImage *input, int32_t *min, int32_t *max);
+ /** Resets global minimum and maximum. */
+ void reset();
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Performs the min/max algorithm on U8 images on a given window.
+ *
+ * @param win The window to run the algorithm on.
+ */
+ void minmax_U8(const Window &win);
+ /** Performs the min/max algorithm on S16 images on a given window.
+ *
+ * @param win The window to run the algorithm on.
+ */
+ void minmax_S16(const Window &win);
+ /** Common signature for all the specialised MinMax functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MinMaxFunction = void (NEMinMaxKernel::*)(const Window &window);
+ /** MinMax function to use for the particular image types passed to configure() */
+ MinMaxFunction _func;
+ /** Helper to update min/max values */
+ template <typename T>
+ void update_min_max(T min, T max);
+
+ const IImage *_input; /**< Input image. */
+ int32_t *_min; /**< Minimum value. */
+ int32_t *_max; /**< Maximum value. */
+ int32_t _min_init; /**< Value to initialise global minimum value. */
+ int32_t _max_init; /**< Value to initialise global maximum value. */
+ std::mutex _mtx; /**< Mutex used for result reduction. */
+};
+
+/** Interface for the kernel to find the min/max locations in an image. */
+class NEMinMaxLocationKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEMinMaxLocationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
+ /** Default destructor */
+ ~NEMinMaxLocationKernel() = default;
+
+ /** Initialise the kernel's input and outputs.
+ *
+ * @param[in] input Input Image. Data types supported: U8 or S16.
+ * @param[out] min Minimum value of image.
+ * @param[out] max Maximum value of image.
+ * @param[out] min_loc Array of minimum value locations.
+ * @param[out] max_loc Array of maximum value locations.
+ * @param[out] min_count Number of minimum value encounters.
+ * @param[out] max_count Number of maximum value encounters.
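+ *
+ * A hedged sketch locating minima only (image and min_locations are assumed
+ * pre-allocated; the remaining outputs keep their nullptr defaults):
+ * @code
+ * int32_t  min_val = 0, max_val = 0;
+ * uint32_t min_count = 0;
+ * NEMinMaxLocationKernel minmax_loc;
+ * minmax_loc.configure(&image, &min_val, &max_val, &min_locations, nullptr, &min_count);
+ * @endcode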
+ */
+ void configure(const IImage *input, int32_t *min, int32_t *max,
+ ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
+ uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ bool is_parallelisable() const override;
+
+private:
+ /** Performs the min/max location algorithm on T type images on a given window.
+ *
+ * @param win The window to run the algorithm on.
+ */
+ template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
+ void minmax_loc(const Window &win);
+ /** Common signature for all the specialised MinMaxLoc functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
+ /** MinMaxLoc function to use for the particular image types passed to configure() */
+ MinMaxLocFunction _func;
+ /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
+ template <class T, typename>
+ struct create_func_table;
+
+ const IImage *_input; /**< Input image. */
+ int32_t *_min; /**< Minimum value. */
+ int32_t *_max; /**< Maximum value. */
+ uint32_t *_min_count; /**< Count of minimum value encounters. */
+ uint32_t *_max_count; /**< Count of maximum value encounters. */
+ ICoordinates2DArray *_min_loc; /**< Locations of minimum values. */
+ ICoordinates2DArray *_max_loc; /**< Locations of maximum values. */
+ unsigned int _num_elems_processed_per_iteration; /**< Elements processed per iteration. */
+};
+}
+#endif /*__ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
new file mode 100644
index 0000000000..ede0294a73
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+#define __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class NENonLinearFilterKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NENonLinearFilterKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonLinearFilterKernel(NENonLinearFilterKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
+ /** Set the source, destination and border mode of the kernel
+ *
+ * @param[in] input Source tensor. Data type supported: U8
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] function Non-linear function to perform
+ * @param[in] mask_size Mask size. Supported sizes: 3, 5
+ * @param[in] pattern Mask pattern
+ * @param[in] mask The given mask. Will be used only if pattern is set to PATTERN_OTHER
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Fill mask with the corresponding given pattern.
+ *
+ * @param[in,out] mask Mask to be filled according to pattern
+ * @param[in] cols Columns (width) of mask
+ * @param[in] rows Rows (height) of mask
+ * @param[in] pattern Pattern to fill the mask according to
+ */
+ void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
+ /** Apply a median filter when the given mask pattern is defined as box.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void median_filter_box(const Window &win);
+ /** Apply a min filter when the given mask pattern is defined as box.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void min_filter_box(const Window &win);
+ /** Apply a max filter when the given mask pattern is defined as box.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void max_filter_box(const Window &win);
+ /** Apply a median filter when the given mask pattern is defined as cross.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void median_filter_cross(const Window &win);
+ /** Apply a min filter when the given mask pattern is defined as cross.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void min_filter_cross(const Window &win);
+ /** Apply a max filter when the given mask pattern is defined as cross.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void max_filter_cross(const Window &win);
+ /** Apply a median filter when the given mask pattern is defined as disk.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void median_filter_disk(const Window &win);
+ /** Apply a min filter when the given mask pattern is defined as disk.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void min_filter_disk(const Window &win);
+ /** Apply a max filter when the given mask pattern is defined as disk.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void max_filter_disk(const Window &win);
+ /** Apply a non-linear filter when the given mask has a user-defined pattern.
+ *
+ * @param[in] win Window to apply the filter on.
+ */
+ template <int mask_w, int mask_h>
+ void non_linear_filter_generic(const Window &win);
+
+private:
+ unsigned int _border_width;
+ const ITensor *_input;
+ ITensor *_output;
+ const uint8_t *_mask;
+ MatrixPattern _pattern;
+ NonLinearFilterFunction _function;
+ unsigned int _func_idx;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__ */
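
[Editorial usage sketch, not part of this patch: a minimal configure/run sequence for this kernel. It assumes the runtime Tensor/TensorInfo helpers and the NEScheduler::get().schedule() entry point behave as elsewhere in the library at this commit, and that the NonLinearFilterFunction/MatrixPattern enumerators come from core/Types.h; configure() before allocate() follows the library's usual pattern.]

#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));

    NENonLinearFilterKernel filter;
    // 3x3 median over a box pattern; the mask pointer is only read for the
    // user-defined pattern, so nullptr is acceptable here.
    filter.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 3, MatrixPattern::BOX, nullptr, false);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, then run the kernel, split along the Y dimension ...
    NEScheduler::get().schedule(&filter, Window::DimY);
    return 0;
}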
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
new file mode 100644
index 0000000000..0daae59e54
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+#define __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface to perform Non-Maxima suppression over a 3x3 window using NEON
+ *
+ * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
+ */
+class NENonMaximaSuppression3x3Kernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NENonMaximaSuppression3x3Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
+ /** Default destructor */
+ ~NENonMaximaSuppression3x3Kernel() = default;
+
+ /** Initialise the kernel's sources, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/F32
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+protected:
+ /** Common signature for all the specialised non-maxima suppression 3x3 functions
+ *
+ * @param[in] input_ptr Pointer to the input tensor.
+ * @param[out] output_ptr Pointer to the output tensor
+ * @param[in] input_stride Stride of the input tensor
+ */
+ using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
+
+ NonMaxSuppr3x3Function *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+ const ITensor *_input; /**< Source tensor */
+ ITensor *_output; /**< Destination tensor */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32
+ */
+class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
+{
+public:
+ /** Initialise the kernel's sources, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8/F32.
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output, bool border_undefined);
+};
+#else
+using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
+#endif
+}
+#endif /*__ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__ */
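
[Editorial usage sketch, not part of this patch: standalone use of the kernel, although it normally runs inside NEFastCorners/NEHarrisCorners as the note above says. Scaffolding assumptions (Tensor, NEScheduler) are the same as in the previous sketch.]

#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor response, suppressed;
    const TensorInfo info(TensorShape(320U, 240U), 1, DataType::F32);
    response.allocator()->init(info);
    suppressed.allocator()->init(info);

    // Without ARM_COMPUTE_ENABLE_FP16 this name aliases the plain kernel,
    // so the same code builds either way.
    NENonMaximaSuppression3x3FP16Kernel nms;
    nms.configure(&response, &suppressed, false);

    response.allocator()->allocate();
    suppressed.allocator()->allocate();
    NEScheduler::get().schedule(&nms, Window::DimY);
    return 0;
}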
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
new file mode 100644
index 0000000000..d4e36d5ff1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class NENormalizationLayerKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NENormalizationLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerKernel(const NENormalizationLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
+ /** Default Move Constructor. */
+ NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
+ /** Default move assignment operator. */
+ NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
+ /** Default destructor */
+ ~NENormalizationLayerKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32.
+ * @param[in] input_squared Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+ * Data type supported: same as @p input
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+ void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Function to perform normalization depending on the given template
+ * dimension. The second template parameter specifies whether the
+ * normalization has to be 1D or 2D.
+ *
+ * @note Only supported normalizations are:
+ * - 1D over X or Z
+ * - 2D over X and Y
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ template <unsigned int dim, bool do_2D_norm>
+ void normalize(const Window &window);
+
+ /** Function to perform normalization for fixed-point values depending on
+ * the given template dimension. The second template parameter specifies
+ * whether the normalization has to be 1D or 2D.
+ *
+ * @note Only supported normalizations are:
+ * - 1D over X or Z
+ * - 2D over X and Y
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ template <unsigned int dim, bool do_2D_norm>
+ void normalize_fixed_point(const Window &window);
+ /** Common signature for all the specialised normalization functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
+
+private:
+ NormalizationFunction _func;
+ const ITensor *_input;
+ const ITensor *_input_squared;
+ ITensor *_output;
+ NormalizationLayerInfo _norm_info;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__ */
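
[Editorial usage sketch, not part of this patch: configure() consumes a pre-squared copy of the input, so the caller must produce input_squared itself. A minimal sketch assuming the NEPixelWiseMultiplicationKernel declared later in this patch is used for the squaring, and that NormType::CROSS_MAP and the policy enumerators exist in core/Types.h as at this commit.]

#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const TensorInfo info(TensorShape(13U, 13U, 96U), 1, DataType::F32);
    Tensor in, in_sq, out;
    in.allocator()->init(info);
    in_sq.allocator()->init(info);
    out.allocator()->init(info);

    // input_squared = input * input (scale 1, round-to-zero for scale != 1/255)
    NEPixelWiseMultiplicationKernel square;
    square.configure(&in, &in, &in_sq, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    NENormalizationLayerKernel norm;
    norm.configure(&in, &in_sq, &out, NormalizationLayerInfo(NormType::CROSS_MAP, 5));

    in.allocator()->allocate();
    in_sq.allocator()->allocate();
    out.allocator()->allocate();
    // ... fill `in`, then run the two stages in order ...
    NEScheduler::get().schedule(&square, Window::DimY);
    NEScheduler::get().schedule(&norm, Window::DimY);
    return 0;
}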
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
new file mode 100644
index 0000000000..7e402cd220
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__
+#define __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform pixel-wise multiplication between two tensors */
+class NEPixelWiseMultiplicationKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEPixelWiseMultiplicationKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
+ /** Default destructor */
+ ~NEPixelWiseMultiplicationKernel() = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @note For @p scale equal to 1/255, only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values, only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in] input1 An input tensor. Data types supported: U8/QS8/S16/F32.
+ * @param[in] input2 An input tensor. Data types supported: U8/QS8/S16/F32.
+ * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8)/S16/F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy.
+ * @param[in] rounding_policy Rounding policy.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the specialised multiplication functions with integer scaling factor
+ *
+ * @param[in] input1_ptr Pointer to the first input tensor.
+ * @param[in] input2_ptr Pointer to the second input tensor.
+ * @param[out] output_ptr Pointer to the output tensor.
+ * @param[in] scale Integer scaling factor.
+ */
+ using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
+ /** Common signature for all the specialised multiplication functions with fixed-point values
+ *
+ * @param[in] input1_ptr Pointer to the first input tensor.
+ * @param[in] input2_ptr Pointer to the second input tensor.
+ * @param[out] output_ptr Pointer to the output tensor.
+ * @param[in] scale Scaling factor.
+ * @param[in] fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number.
+ */
+ using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position);
+ /** Common signature for all the specialised multiplication functions with float scaling factor
+ *
+ * @param[in] input1_ptr Pointer to the first input tensor.
+ * @param[in] input2_ptr Pointer to the second input tensor.
+ * @param[out] output_ptr Pointer to the output tensor.
+ * @param[in] scale Float scaling factor.
+ */
+ using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale);
+
+ MulFunctionFloat *_func_float;
+ MulFunctionInt *_func_int;
+ MulFunctionQInt *_func_q_int;
+
+private:
+ const ITensor *_input1;
+ const ITensor *_input2;
+ ITensor *_output;
+ float _scale;
+ int _scale_exponent;
+};
+}
+#endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__ */
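
[Editorial usage sketch, not part of this patch: the 1/255 case from the note in configure(). Scaffolding assumptions as in the earlier sketches; the ConvertPolicy/RoundingPolicy enumerator names are assumed from core/Types.h.]

#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, prod;
    const TensorInfo info(TensorShape(224U, 224U), 1, DataType::U8);
    a.allocator()->init(info);
    b.allocator()->init(info);
    prod.allocator()->init(info); // U8 output is allowed because both inputs are U8

    NEPixelWiseMultiplicationKernel mul;
    // scale == 1/255 needs round to nearest even per the note above
    // (which the library implements as round half up)
    mul.configure(&a, &b, &prod, 1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);

    a.allocator()->allocate();
    b.allocator()->allocate();
    prod.allocator()->allocate();
    NEScheduler::get().schedule(&mul, Window::DimY);
    return 0;
}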
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
new file mode 100644
index 0000000000..62a087841a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the pooling layer kernel */
+class NEPoolingLayerKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEPoolingLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
+ /** Default destructor */
+ ~NEPoolingLayerKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** Function to perform 2x2 pooling.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling2_f32(const Window &window_input, const Window &window);
+ /** Function to perform 2x2 pooling for 8-bit fixed point.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling2_q8(const Window &window_input, const Window &window);
+ /** Function to perform 3x3 pooling.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling3_f32(const Window &window_input, const Window &window);
+ /** Function to perform 3x3 pooling for 8-bit fixed point.
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ template <PoolingType pooling_type>
+ void pooling3_q8(const Window &window_input, const Window &window);
+ /** Common signature for all the specialised Pooling functions
+ *
+ * @param[in] window_input Input region on which to execute the kernel.
+ * @param[in] window Output region on which to execute the kernel.
+ */
+ using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window);
+
+private:
+ PoolingFunction _func;
+ const ITensor *_input;
+ ITensor *_output;
+ PoolingLayerInfo _pool_info;
+ int _num_elems_processed_per_iteration;
+ BorderSize _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ */
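
[Editorial usage sketch, not part of this patch: 2x2 max pooling with stride 2, assuming the PoolingLayerInfo/PadStrideInfo constructors from core/Types.h and the usual Tensor/NEScheduler scaffolding.]

#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32));

    NEPoolingLayerKernel pool;
    // 2x2 max pooling, stride 2, no padding -> halves width and height
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    NEScheduler::get().schedule(&pool, Window::DimY);
    return 0;
}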
diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h
new file mode 100644
index 0000000000..f9eae68ee8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NERemapKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEREMAPKERNEL_H__
+#define __ARM_COMPUTE_NEREMAPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a remap on a tensor */
+class NERemapKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NERemapKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NERemapKernel(const NERemapKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NERemapKernel &operator=(const NERemapKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NERemapKernel(NERemapKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NERemapKernel &operator=(NERemapKernel &&) = default;
+ /** Default destructor */
+ ~NERemapKernel() = default;
+
+ /** Initialize the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[in] map_x Map for X coordinates. Data type supported: F32.
+ * @param[in] map_y Map for Y coordinates. Data type supported: F32.
+ * @param[out] output Destination tensor. Data type supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
+ * @param[in] policy The interpolation type.
+ */
+ void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** function to perform nearest interpolation on the given window */
+ void remap_nearest(const Window &window);
+ /** function to perform bilinear interpolation on the given window */
+ void remap_bilinear(const Window &window);
+ /** Remap function to use for the particular interpolation type passed to configure() */
+ void (NERemapKernel::*_func)(const Window &window);
+
+ const ITensor *_input; /**< Input image */
+ ITensor *_output; /**< Output image */
+ const ITensor *_map_x; /**< Input remap x coordinates */
+ const ITensor *_map_y; /**< Input remap y coordinates */
+};
+}
+#endif /*__ARM_COMPUTE_NEREMAPKERNEL_H__ */
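
[Editorial usage sketch, not part of this patch: the map tensors hold one F32 source coordinate per output pixel, so an identity remap would store x in map_x and y in map_y at every (x, y). Scaffolding assumptions as above.]

#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, map_x, map_y, dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    // One F32 source coordinate per output pixel
    map_x.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::F32));
    map_y.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::F32));

    NERemapKernel remap;
    remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR);

    Tensor *tensors[] = { &src, &map_x, &map_y, &dst };
    for(Tensor *t : tensors)
    {
        t->allocator()->allocate();
    }
    // ... fill src and write the source coordinates into map_x/map_y ...
    NEScheduler::get().schedule(&remap, Window::DimY);
    return 0;
}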
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
new file mode 100644
index 0000000000..03e26520b5
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCALEKERNEL_H__
+#define __ARM_COMPUTE_NESCALEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform scaling on a tensor */
+class NEScaleKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEScaleKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEScaleKernel(const NEScaleKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEScaleKernel &operator=(const NEScaleKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEScaleKernel(NEScaleKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEScaleKernel &operator=(NEScaleKernel &&) = default;
+ /** Default destructor */
+ ~NEScaleKernel() = default;
+
+ /** Initialise the kernel's inputs, output and interpolation policy
+ *
+ * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
+ *
+ * @param[in] input Source tensor. Data types supported: U8/S16.
+ * @param[in] dx Distance between each sampling point's real X coordinate and the preceding integer X coordinate, i.e. the fractional part of the coordinate. Data type supported: F32
+ * @param[in] dy Distance between each sampling point's real Y coordinate and the preceding integer Y coordinate, i.e. the fractional part of the coordinate. Data type supported: F32
+ * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+ * @param[out] output Destination tensor. Data types supported: U8/S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] policy Interpolation type to use
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ /** function to perform scale using nearest interpolation on the given window */
+ void scale_nearest(const Window &window);
+ /** function to perform scale using bilinear interpolation on the given window */
+ void scale_bilinear(const Window &window);
+ /** function to perform scale using area interpolation on the given window
+ *
+ * @note Used only in case of down-sampling.
+ */
+ void scale_area(const Window &window);
+ /** Scale function to use for the particular interpolation type passed to configure() */
+ void (NEScaleKernel::*_func)(const Window &window);
+
+ const ITensor *_offsets;
+ const ITensor *_dx;
+ const ITensor *_dy;
+ const ITensor *_input;
+ ITensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NESCALEKERNEL_H__ */
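
[Editorial usage sketch, not part of this patch: a 2x bilinear downscale. Note that dx, dy and offsets are inputs here; their contents (sample positions and fractional distances) must be precomputed by the caller, which the NEScale runtime function normally does, and that step is elided below. Scaffolding assumptions as above.]

#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst, dx, dy, offsets;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::U8));
    // dx, dy and offsets all share the *output* dimensions, per the note in configure()
    dx.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::F32));
    dy.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::F32));
    offsets.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::S32));

    NEScaleKernel scale;
    scale.configure(&src, &dx, &dy, &offsets, &dst, InterpolationPolicy::BILINEAR, false);

    Tensor *tensors[] = { &src, &dst, &dx, &dy, &offsets };
    for(Tensor *t : tensors)
    {
        t->allocator()->allocate();
    }
    // ... fill src and precompute dx/dy/offsets before running ...
    NEScheduler::get().schedule(&scale, Window::DimY);
    return 0;
}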
diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
new file mode 100644
index 0000000000..c618456d49
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCHARR3x3KERNEL_H__
+#define __ARM_COMPUTE_NESCHARR3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
+ *
+ * @f[
+ * \mathbf{G}_x=\begin{vmatrix}
+ * -3 & 0 & +3\\
+ * -10 & 0 & +10\\
+ * -3 & 0 & +3
+ * \end{vmatrix}
+ * @f]
+ */
+class NEScharr3x3Kernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEScharr3x3Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default;
+ /** Default destructor */
+ ~NEScharr3x3Kernel() = default;
+
+ /** Initialise the kernel's source, destination and border.
+ *
+ * @note At least one of output_x or output_y must be set.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ bool _run_scharr_x; /**< Do we need to run Scharr X? */
+ bool _run_scharr_y; /**< Do we need to run Scharr Y? */
+ const ITensor *_input; /**< Input tensor */
+ ITensor *_output_x; /**< Output tensor for scharr X */
+ ITensor *_output_y; /**< Output tensor for scharr Y */
+};
+}
+#endif /*__ARM_COMPUTE_NESCHARR3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
new file mode 100644
index 0000000000..246dd83573
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL3x3KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. The X gradient is computed with the kernel:
+ *
+ * @f[
+ * \mathbf{G}_x=\begin{vmatrix}
+ * -1 & 0 & +1\\
+ * -2 & 0 & +2\\
+ * -1 & 0 & +1
+ * \end{vmatrix}
+ * @f]
+ */
+class NESobel3x3Kernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NESobel3x3Kernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel3x3Kernel(const NESobel3x3Kernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NESobel3x3Kernel(NESobel3x3Kernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default;
+ /** Default destructor */
+ ~NESobel3x3Kernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @note At least one of output_x or output_y must be set.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ bool _run_sobel_x; /**< Do we need to run Sobel X ? */
+ bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
+ const ITensor *_input; /**< Input tensor */
+ ITensor *_output_x; /**< Output tensor for sobel X */
+ ITensor *_output_y; /**< Output tensor for sobel Y */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL3x3KERNEL_H__ */
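
[Editorial usage sketch, not part of this patch: computing both gradients in one pass; either output may instead be nullptr, per the note in configure(). Scaffolding assumptions as above.]

#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, gx, gy;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    gx.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::S16));
    gy.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::S16));

    NESobel3x3Kernel sobel;
    sobel.configure(&src, &gx, &gy, true); // border left undefined

    src.allocator()->allocate();
    gx.allocator()->allocate();
    gy.allocator()->allocate();
    NEScheduler::get().schedule(&sobel, Window::DimY);
    return 0;
}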
diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
new file mode 100644
index 0000000000..49c1c41e6d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL5x5KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL5x5KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 5x5 Sobel filter on a tensor.
+ *
+ */
+class NESobel5x5HorKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NESobel5x5HorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default;
+ /** Default destructor */
+ ~NESobel5x5HorKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @note At least one of output_x or output_y must be set
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const ITensor *_input; /**< Input tensor */
+ ITensor *_output_x; /**< X output of horizontal pass */
+ ITensor *_output_y; /**< Y output of horizontal pass */
+ bool _run_sobel_x; /**< Do we need to run Sobel X? */
+ bool _run_sobel_y; /**< Do we need to run Sobel Y? */
+ BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 5x5 Sobel filter on a tensor.
+ *
+ */
+class NESobel5x5VertKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NESobel5x5VertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default;
+ /** Default destructor */
+ ~NESobel5x5VertKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @param[in] input_x Input for X (X output of hor pass). Data type supported: S16.
+ * @param[in] input_y Input for Y (Y output of hor pass). Data type supported: S16.
+ * @param[out] output_x Destination tensor for the X gradient. Data type supported: S16.
+ * @param[out] output_y Destination tensor for the Y gradient. Data type supported: S16.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ ITensor *_input_x; /**< X input (X output of the hor pass) */
+ ITensor *_input_y; /**< Y input (Y output of the hor pass) */
+ ITensor *_output_x; /**< X output of sobel */
+ ITensor *_output_y; /**< Y output of sobel */
+ bool _run_sobel_x; /**< Do we need to run sobel X? */
+ bool _run_sobel_y; /**< Do we need to run sobel Y? */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL5x5KERNEL_H__ */
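
[Editorial usage sketch, not part of this patch: the 5x5 filter is separable, hence the two kernels; the horizontal pass produces S16 intermediates that the vertical pass consumes. A sketch of the chaining, with the usual scaffolding assumptions.]

#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, tmp_x, tmp_y, gx, gy;
    const TensorShape shape(640U, 480U);
    src.allocator()->init(TensorInfo(shape, 1, DataType::U8));
    Tensor *s16[] = { &tmp_x, &tmp_y, &gx, &gy };
    for(Tensor *t : s16)
    {
        t->allocator()->init(TensorInfo(shape, 1, DataType::S16));
    }

    NESobel5x5HorKernel hor;
    NESobel5x5VertKernel vert;
    hor.configure(&src, &tmp_x, &tmp_y, true);
    vert.configure(&tmp_x, &tmp_y, &gx, &gy, true);

    src.allocator()->allocate();
    for(Tensor *t : s16)
    {
        t->allocator()->allocate();
    }
    NEScheduler::get().schedule(&hor, Window::DimY);
    NEScheduler::get().schedule(&vert, Window::DimY);
    return 0;
}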
diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
new file mode 100644
index 0000000000..4bff8596b8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL7x7KERNEL_H__
+#define __ARM_COMPUTE_NESOBEL7x7KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the horizontal pass of the 7x7 Sobel filter on a tensor.
+ *
+ */
+class NESobel7x7HorKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NESobel7x7HorKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default;
+ /** Default destructor */
+ ~NESobel7x7HorKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @note At least one of output_x or output_y must be set.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S32.
+ * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const ITensor *_input; /**< Input tensor */
+ ITensor *_output_x; /**< X output of horizontal pass */
+ ITensor *_output_y; /**< Y output of horizontal pass */
+ bool _run_sobel_x; /**< Do we need to run Sobel X? */
+ bool _run_sobel_y; /**< Do we need to run Sobel Y? */
+ BorderSize _border_size; /**< Border size */
+};
+
+/** Interface for the kernel to run the vertical pass of the 7x7 Sobel filter on a tensor.
+ *
+ */
+class NESobel7x7VertKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NESobel7x7VertKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default;
+ /** Default destructor */
+ ~NESobel7x7VertKernel() = default;
+
+ /** Initialise the kernel's source, destination and border mode.
+ *
+ * @note At least one of output_x or output_y must be set
+ * @note If output_x is set then input_x must be set too
+ * @note If output_y is set then input_y must be set too
+ *
+ * @param[in] input_x (Optional) Input for X (X output of hor pass). Data type supported: S32.
+ * @param[in] input_y (Optional) Input for Y (Y output of hor pass). Data type supported: S32.
+ * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S32.
+ * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S32.
+ * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */
+ void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ const ITensor *_input_x; /**< X input (X output of the hor pass) */
+ const ITensor *_input_y; /**< Y input (Y output of the hor pass) */
+ ITensor *_output_x; /**< X output of sobel */
+ ITensor *_output_y; /**< Y output of sobel */
+ bool _run_sobel_x; /**< Do we need to run sobel X? */
+ bool _run_sobel_y; /**< Do we need to run sobel Y? */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL7x7KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
new file mode 100644
index 0000000000..ab626ad5ec
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__
+#define __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for identifying the max value of 1D Logits */
+class NELogits1DMaxKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NELogits1DMaxKernel();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ */
+ void configure(const ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+ BorderSize border_size() const override;
+
+private:
+ using Logits1DMaxFunction = void(const ITensor *in, ITensor *out, const Window &window);
+
+private:
+ Logits1DMaxFunction *_func;
+ BorderSize _border_size;
+};
+
+/** Interface for shifting the logits values around the max value and exponentiating the result */
+class NELogits1DShiftExpSumKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NELogits1DShiftExpSumKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogits1DShiftExpSumKernel(const NELogits1DShiftExpSumKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogits1DShiftExpSumKernel &operator=(const NELogits1DShiftExpSumKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NELogits1DShiftExpSumKernel(NELogits1DShiftExpSumKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NELogits1DShiftExpSumKernel &operator=(NELogits1DShiftExpSumKernel &&) = default;
+ /** Default destructor */
+ ~NELogits1DShiftExpSumKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/F32.
+ * @param[in] max Max values tensor. Data types supported: same as @p input.
+ * @param[out] output Destination tensor. Data types supported: same as @p input.
+ * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input.
+ */
+ void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window);
+
+private:
+ Logits1DShiftExpSumFunction *_func;
+ const ITensor *_input;
+ const ITensor *_max;
+ ITensor *_output;
+ ITensor *_sum;
+};
+
+/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
+class NELogits1DNormKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NELogits1DNormKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogits1DNormKernel(const NELogits1DNormKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogits1DNormKernel &operator=(const NELogits1DNormKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NELogits1DNormKernel(NELogits1DNormKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NELogits1DNormKernel &operator=(NELogits1DNormKernel &&) = default;
+ /** Default destructor */
+ ~NELogits1DNormKernel() = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/F32.
+ * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input.
+ * @param[out] output Destination tensor. Data types supported: same as @p input.
+ */
+ void configure(const ITensor *input, const ITensor *sum, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ using Logits1DNormFunction = void(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window);
+
+private:
+ Logits1DNormFunction *_func;
+ const ITensor *_input;
+ const ITensor *_sum;
+ ITensor *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ */
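
[Editorial usage sketch, not part of this patch: the three kernels implement the numerically-stable softmax pipeline — per-row max, shifted exponentials plus their sum, then normalization by that sum. A sketch for a single 1000-class logit row; the max and sum tensors collapse the X dimension to 1. Scaffolding assumptions as above.]

#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor in, max, shifted, sum, out;
    const TensorInfo row(TensorShape(1000U), 1, DataType::F32);
    const TensorInfo scalar(TensorShape(1U), 1, DataType::F32);
    in.allocator()->init(row);
    shifted.allocator()->init(row);
    out.allocator()->init(row);
    max.allocator()->init(scalar);
    sum.allocator()->init(scalar);

    NELogits1DMaxKernel max_k;
    NELogits1DShiftExpSumKernel exp_k;
    NELogits1DNormKernel norm_k;
    max_k.configure(&in, &max);
    exp_k.configure(&in, &max, &shifted, &sum);
    norm_k.configure(&shifted, &sum, &out);

    Tensor *tensors[] = { &in, &max, &shifted, &sum, &out };
    for(Tensor *t : tensors)
    {
        t->allocator()->allocate();
    }
    // ... fill `in`, then run the three stages in order ...
    NEScheduler::get().schedule(&max_k, Window::DimY);
    NEScheduler::get().schedule(&exp_k, Window::DimY);
    NEScheduler::get().schedule(&norm_k, Window::DimY);
    return 0;
}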
diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
new file mode 100644
index 0000000000..b3963e5a75
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__
+#define __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ILut;
+
+/** Interface for the kernel to perform table lookup calculations. */
+class NETableLookupKernel : public INESimpleKernel
+{
+public:
+ /** Default constructor */
+ NETableLookupKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETableLookupKernel(const NETableLookupKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETableLookupKernel &operator=(const NETableLookupKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NETableLookupKernel(NETableLookupKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
+ /** Initialise the kernel's input, lut and output.
+ *
+ * @param[in] input An input tensor. Data types supported: U8/S16.
+ * @param[in] lut The input LUT.
+ * @param[out] output The output tensor. Data types supported: same as @p input
+ */
+ void configure(const ITensor *input, const ILut *lut, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Perform table lookup on a given window.
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ template <class T>
+ void tableLookup(const Window &window);
+ /** Common signature for all the specialised lut functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window);
+ /** Sub function to use for the particular tensor types passed to configure() */
+ TableLookupFunction _func;
+ const ILut *_lut;
+};
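+
+// Illustrative usage sketch: "src" and "dst" are assumed U8 tensors and "lut"
+// an assumed, allocated look-up table (e.g. an arm_compute::Lut); the kernel
+// is run single-threaded over its full window.
+//
+//   NETableLookupKernel kernel;
+//   kernel.configure(&src, &lut, &dst);
+//   kernel.run(kernel.window());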
+}
+#endif /* __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
new file mode 100644
index 0000000000..778176293f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETHRESHOLDKERNEL_H__
+#define __ARM_COMPUTE_NETHRESHOLDKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the thresholding kernel
+ *
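+ * A minimal configuration sketch (illustrative only; "src" and "dst" are
+ * assumed, pre-allocated U8 tensors, run single-threaded over the full window):
+ * @code
+ * NEThresholdKernel kernel;
+ * kernel.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY, 0);
+ * kernel.run(kernel.window());
+ * @endcode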
+ */
+class NEThresholdKernel : public INEKernel
+{
+public:
+ /** Constructor
+ * Initialize all the pointers to nullptr and parameters to zero.
+ */
+ NEThresholdKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEThresholdKernel(const NEThresholdKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
+ /** Initialise the kernel's input, output and threshold parameters.
+ *
+ * @param[in] input An input tensor. Data type supported: U8
+ * @param[out] output The output tensor. Data type supported: U8.
+ * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+ * @param[in] false_value Value to set when the condition is not met.
+ * @param[in] true_value Value to set when the condition is met.
+ * @param[in] type Thresholding type. Either RANGE or BINARY.
+ * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
+ */
+ void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Run binary thresholding on the given window */
+ void run_binary(const Window &window);
+ /** Run range thresholding on the given window */
+ void run_range(const Window &window);
+
+ /** Thresholding function to use for the particular thresholding type passed to configure() */
+ void (NEThresholdKernel::*_func)(const Window &window);
+
+ const ITensor *_input; /**< Input */
+ ITensor *_output; /**< Output */
+ uint8_t _threshold; /**< Threshold (used as the lower threshold when the type is RANGE) */
+ uint8_t _false_value; /**< Value set when the condition is not met */
+ uint8_t _true_value; /**< Value set when the condition is met */
+ uint8_t _upper; /**< Upper threshold (only used when the type is RANGE) */
+};
+}
+#endif /*__ARM_COMPUTE_NETHRESHOLDKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
new file mode 100644
index 0000000000..ac9449ff92
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSEKERNEL_H__
+#define __ARM_COMPUTE_NETRANSPOSEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel which transposes the elements of a matrix.
+ *
+ * [width, height, batch] -> [height, width, batch]
+ *
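+ * A minimal usage sketch (illustrative only; "src" is an assumed [W, H] tensor
+ * and "dst" a matching [H, W] tensor of the same data type):
+ * @code
+ * NETransposeKernel kernel;
+ * kernel.configure(&src, &dst);
+ * kernel.run(kernel.window());
+ * @endcode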
+ */
+class NETransposeKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NETransposeKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeKernel(const NETransposeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETransposeKernel &operator=(const NETransposeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NETransposeKernel(NETransposeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NETransposeKernel &operator=(NETransposeKernel &&) = default;
+ /** Default destructor */
+ ~NETransposeKernel() = default;
+
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ */
+ void configure(const ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ /** Common signature for all the transpose functions
+ *
+ * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output The output tensor. Data type supported: same as @p input
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using TransposeFunction = void(const ITensor *input, ITensor *output, const Window &window);
+ /** Transpose function to use for the particular tensor types passed to configure() */
+ TransposeFunction *_func;
+ const ITensor *_input;
+ ITensor *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h
new file mode 100644
index 0000000000..10fed1d450
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWarpKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWARPKERNEL_H__
+#define __ARM_COMPUTE_NEWARPKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for warp affine and warp perspective */
+class INEWarpKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ INEWarpKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INEWarpKernel(const INEWarpKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INEWarpKernel &operator=(const INEWarpKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ INEWarpKernel(INEWarpKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ INEWarpKernel &operator=(INEWarpKernel &&) = default;
+ /** Initialise the kernel's input, output and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: U8.
+ * @param[in] matrix The affine or perspective matrix to use. Must be a 2x3 matrix of floats for affine warps or a 3x3 matrix of floats for perspective warps.
+ * @param[in] border_mode Strategy to use for borders
+ * @param[in] constant_border_value Constant value used for filling the border.
+ */
+ virtual void configure(const ITensor *input, ITensor *output, const float *matrix, BorderMode border_mode, uint8_t constant_border_value);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+protected:
+ /** Function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ virtual void warp_undefined(const Window &window) = 0;
+ /** Function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ virtual void warp_constant(const Window &window) = 0;
+ /** Function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
+ *
+ * @param[in] window Region on which to execute the kernel
+ */
+ virtual void warp_replicate(const Window &window) = 0;
+ /** Common signature for all the specialised warp functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ void (INEWarpKernel::*_func)(const Window &window);
+
+ const ITensor *_input; /**< Input Tensor */
+ ITensor *_output; /**< Output Tensor */
+ uint8_t _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */
+ const float *_matrix; /**< The affine or perspective matrix. Must be a 2x3 matrix of floats for warp affine or a 3x3 matrix of floats for warp perspective. */
+};
+
+/** Template interface for the kernel to compute warp affine
+ *
+ */
+template <InterpolationPolicy interpolation>
+class NEWarpAffineKernel : public INEWarpKernel
+{
+private:
+ // Inherited methods overridden:
+ void warp_undefined(const Window &window) override;
+ void warp_constant(const Window &window) override;
+ void warp_replicate(const Window &window) override;
+};
+
+/** Template interface for the kernel to compute warp perspective
+ *
+ */
+template <InterpolationPolicy interpolation>
+class NEWarpPerspectiveKernel : public INEWarpKernel
+{
+private:
+ // Inherited methods overridden:
+ void warp_undefined(const Window &window) override;
+ void warp_constant(const Window &window) override;
+ void warp_replicate(const Window &window) override;
+};
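+
+// Illustrative usage sketch: "src" and "dst" are assumed U8 tensors; the 2x3
+// affine coefficients are passed as a flat float array whose layout follows
+// the library's warp convention.
+//
+//   const float mat[6] = {}; // fill with the 2x3 affine coefficients
+//   NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR> warp;
+//   warp.configure(&src, &dst, mat, BorderMode::CONSTANT, 0);
+//   warp.run(warp.window());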
+}
+#endif /*__ARM_COMPUTE_NEWARPKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
new file mode 100644
index 0000000000..cad2d00b1f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
+#define __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layers
+ *
+ * Rearranges each 3-dimensional kernel to a single row, leading to a matrix with linearized kernels.
+ * In combination with the @ref NEIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3D weight kernel of 3x3 dimensions and a depth of 2, we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
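+ *
+ * A minimal configuration sketch (illustrative only; "weights", "biases" and
+ * "reshaped" are assumed, pre-allocated tensors of matching data types):
+ * @code
+ * NEWeightsReshapeKernel kernel;
+ * kernel.configure(&weights, &biases, &reshaped);
+ * kernel.run(kernel.window());
+ * @endcode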
+ */
+class NEWeightsReshapeKernel : public INEKernel
+{
+public:
+ /** Default constructor */
+ NEWeightsReshapeKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
+ /** Default destructor */
+ ~NEWeightsReshapeKernel() = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+ * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/F32
+ * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+ * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+ * @param[out] output The output tensor. Data types supported: Same as @p input
+ */
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window) override;
+
+private:
+ using WeightsReshapeKernel = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window);
+
+ WeightsReshapeKernel *_func;
+ const ITensor *_input;
+ const ITensor *_bias;
+ ITensor *_output;
+};
+}
+
+#endif /*__ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ */
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
new file mode 100644
index 0000000000..b4912ce15a
--- /dev/null
+++ b/arm_compute/core/PixelValue.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_PIXELVALUE_H__
+#define __ARM_COMPUTE_PIXELVALUE_H__
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Class describing the value of a pixel for any image format. */
+class PixelValue
+{
+public:
+ /** Default constructor: value initialized to 0 */
+ PixelValue()
+ : value{ { 0 } }
+ {
+ }
+ /** Initialize the union with a U8 pixel value
+ *
+ * @param[in] v U8 value.
+ */
+ PixelValue(uint8_t v)
+ : PixelValue()
+ {
+ value.u8 = v;
+ }
+ /** Initialize the union with a U16 pixel value
+ *
+ * @param[in] v U16 value.
+ */
+ PixelValue(uint16_t v)
+ : PixelValue()
+ {
+ value.u16 = v;
+ }
+ /** Initialize the union with a S16 pixel value
+ *
+ * @param[in] v S16 value.
+ */
+ PixelValue(int16_t v)
+ : PixelValue()
+ {
+ value.s16 = v;
+ }
+ /** Initialize the union with a U32 pixel value
+ *
+ * @param[in] v U32 value.
+ */
+ PixelValue(uint32_t v)
+ : PixelValue()
+ {
+ value.u32 = v;
+ }
+ /** Initialize the union with a S32 pixel value
+ *
+ * @param[in] v S32 value.
+ */
+ PixelValue(int32_t v)
+ : PixelValue()
+ {
+ value.s32 = v;
+ }
+ /** Initialize the union with a F32 pixel value
+ *
+ * @param[in] v F32 value.
+ */
+ PixelValue(float v)
+ : PixelValue()
+ {
+ value.f32 = v;
+ }
+ /** Union which describes the value of a pixel for any image format.
+ * Use the field corresponding to the image format
+ */
+ union
+ {
+ uint8_t rgb[3]; /**< 3 channels: RGB888 */
+ uint8_t yuv[3]; /**< 3 channels: Any YUV format */
+ uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
+ float f32; /**< Single channel float 32 */
+ uint8_t u8; /**< Single channel U8 */
+ int8_t s8; /**< Single channel S8 */
+ uint16_t u16; /**< Single channel U16 */
+ int16_t s16; /**< Single channel S16 */
+ uint32_t u32; /**< Single channel U32 */
+ int32_t s32; /**< Single channel S32 */
+ } value;
+ /** Interpret the pixel value as a U8
+ *
+ * @param[out] v Returned value
+ */
+ void get(uint8_t &v) const
+ {
+ v = value.u8;
+ }
+ /** Interpret the pixel value as a S8
+ *
+ * @param[out] v Returned value
+ */
+ void get(int8_t &v) const
+ {
+ v = value.s8;
+ }
+ /** Interpret the pixel value as a U16
+ *
+ * @param[out] v Returned value
+ */
+ void get(uint16_t &v) const
+ {
+ v = value.u16;
+ }
+ /** Interpret the pixel value as a S16
+ *
+ * @param[out] v Returned value
+ */
+ void get(int16_t &v) const
+ {
+ v = value.s16;
+ }
+ /** Interpret the pixel value as a U32
+ *
+ * @param[out] v Returned value
+ */
+ void get(uint32_t &v) const
+ {
+ v = value.u32;
+ }
+ /** Interpret the pixel value as a S32
+ *
+ * @param[out] v Returned value
+ */
+ void get(int32_t &v) const
+ {
+ v = value.s32;
+ }
+ /** Interpret the pixel value as a F32
+ *
+ * @param[out] v Returned value
+ */
+ void get(float &v) const
+ {
+ v = value.f32;
+ }
+};
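+
+// Illustrative usage sketch: construct from a typed value and read it back
+// through the matching get() overload ("border" is an assumed name).
+//
+//   PixelValue border(static_cast<uint8_t>(255)); // cast selects the U8 constructor
+//   uint8_t v = 0;
+//   border.get(v); // v == 255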
+}
+#endif /* __ARM_COMPUTE_PIXELVALUE_H__ */
diff --git a/arm_compute/core/PyramidInfo.h b/arm_compute/core/PyramidInfo.h
new file mode 100644
index 0000000000..76b3852bbf
--- /dev/null
+++ b/arm_compute/core/PyramidInfo.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_PYRAMIDINFO_H__
+#define __ARM_COMPUTE_PYRAMIDINFO_H__
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the Pyramid's metadata */
+class PyramidInfo
+{
+public:
+ /** Default constructor */
+ PyramidInfo();
+ /** Default destructor */
+ virtual ~PyramidInfo() = default;
+ /** Allow instances of this class to be copy constructed */
+ PyramidInfo(const PyramidInfo &) = default;
+ /** Allow instances of this class to be copied */
+ PyramidInfo &operator=(const PyramidInfo &) = default;
+ /** Allow instances of this class to be move constructed */
+ PyramidInfo(PyramidInfo &&) = default;
+ /** Allow instances of this class to be moved */
+ PyramidInfo &operator=(PyramidInfo &&) = default;
+
+ /** Create pyramid info for 2D tensors
+ *
+ * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
+ * @param[in] scale Used to indicate the scale between the pyramid levels.
+ * This is required to be a non-zero positive value.
+ * @param[in] width The width of the 2D tensor at 0th pyramid level
+ * @param[in] height The height of the 2D tensor at 0th pyramid level
+ * @param[in] format The format of all 2D tensors in the pyramid
+ * NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
+ */
+ PyramidInfo(size_t num_levels, float scale, size_t width, size_t height, Format format);
+
+ /** Create pyramid info using TensorShape
+ *
+ * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
+ * @param[in] scale Used to indicate the scale between the pyramid levels.
+ * This is required to be a non-zero positive value.
+ * @param[in] tensor_shape It specifies the size of each dimension of the tensor at the 0th pyramid level, in number of elements
+ * @param[in] format The format of all tensors in the pyramid
+ */
+ PyramidInfo(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
+
+ /** Initialize pyramid's metadata for 2D tensors
+ *
+ * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
+ * @param[in] scale Used to indicate the scale between the pyramid levels.
+ * This is required to be a non-zero positive value.
+ * @param[in] width The width of the 2D tensor at 0th pyramid level
+ * @param[in] height The height of the 2D tensor at 0th pyramid level
+ * @param[in] format The format of all 2D tensors in the pyramid
+ * NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
+ */
+ void init(size_t num_levels, float scale, size_t width, size_t height, Format format);
+ /** Initialize pyramid's metadata using TensorShape
+ *
+ * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
+ * @param[in] scale Used to indicate the scale between the pyramid levels.
+ * This is required to be a non-zero positive value.
+ * @param[in] tensor_shape It specifies the size of each dimension of the tensor at the 0th pyramid level, in number of elements
+ * @param[in] format The format of all tensors in the pyramid
+ */
+ void init(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
+ /** Return the number of pyramid levels
+ *
+ * @return The number of pyramid levels
+ */
+ size_t num_levels() const;
+ /** Return the width of the 0th level tensor
+ *
+ * @return The width of the 0th level tensor
+ */
+ size_t width() const;
+ /** Return the height of the 0th level tensor
+ *
+ * @return The height of the 0th level tensor
+ */
+ size_t height() const;
+ /** Return the TensorShape of the 0th level tensor
+ *
+ * @return The TensorShape of the 0th level tensor
+ */
+ const TensorShape &tensor_shape() const;
+ /** Return the image format of all tensors in the pyramid
+ *
+ * @return The image format
+ */
+ Format format() const;
+ /** Return the scale factor of the pyramid
+ *
+ * @return The scale factor between pyramid levels
+ */
+ float scale() const;
+
+private:
+ size_t _num_levels;
+ TensorShape _tensor_shape;
+ Format _format;
+ float _scale;
+};
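+
+// Illustrative usage sketch (names and values are assumptions): a 4-level U8
+// pyramid where each level is half the size of the previous one.
+//
+//   PyramidInfo info(4, 0.5f, 640, 480, Format::U8);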
+}
+#endif /*__ARM_COMPUTE_PYRAMIDINFO_H__ */
diff --git a/arm_compute/core/Size2D.h b/arm_compute/core/Size2D.h
new file mode 100644
index 0000000000..cb053ea2c4
--- /dev/null
+++ b/arm_compute/core/Size2D.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SIZE2D_H__
+#define __ARM_COMPUTE_SIZE2D_H__
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Class for specifying the size of an image or rectangle */
+class Size2D
+{
+public:
+ /** Default constructor */
+ Size2D()
+ : width(0), height(0)
+ {
+ }
+ /** Constructor. Initializes "width" and "height" respectively with "w" and "h"
+ *
+ * @param[in] w Width of the image or rectangle
+ * @param[in] h Height of the image or rectangle
+ */
+ Size2D(size_t w, size_t h)
+ : width(w), height(h)
+ {
+ }
+ /** Constructor. Initializes "width" and "height" with the dimensions of "size"
+ *
+ * @param[in] size Size data object
+ */
+ Size2D(const Size2D &size)
+ : width(size.width), height(size.height)
+ {
+ }
+ /** Copy assignment
+ *
+ * @param[in] size Constant reference input "Size2D" data object to copy
+ *
+ * @return Reference to the newly altered left hand side "Size2D" data object
+ */
+ Size2D &operator=(const Size2D &size)
+ {
+ width = size.width;
+ height = size.height;
+ return *this;
+ }
+ /** The area of the image or rectangle calculated as (width * height)
+ *
+ * @return Area (width * height)
+ *
+ */
+ size_t area() const
+ {
+ return (width * height);
+ }
+
+public:
+ size_t width; /**< Width of the image region or rectangle */
+ size_t height; /**< Height of the image region or rectangle */
+};
+}
+#endif /*__ARM_COMPUTE_SIZE2D_H__ */
diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h
new file mode 100644
index 0000000000..33a88a2568
--- /dev/null
+++ b/arm_compute/core/Steps.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_STEPS_H__
+#define __ARM_COMPUTE_STEPS_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Class to describe a number of elements in each dimension. Similar to @ref
+ * Strides but expressed in number of elements rather than in bytes.
+ */
+class Steps : public Dimensions<unsigned int>
+{
+public:
+ /** Constructor to initialize the steps.
+ *
+ * @param[in] steps Values to initialize the steps.
+ */
+ template <typename... Ts>
+ Steps(Ts... steps)
+ : Dimensions{ steps... }
+ {
+ // Initialize empty dimensions to 1
+ std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+ }
+ /** Allow instances of this class to be copy constructed */
+ constexpr Steps(const Steps &) = default;
+ /** Allow instances of this class to be copied */
+ Steps &operator=(const Steps &) = default;
+ /** Allow instances of this class to be move constructed */
+ constexpr Steps(Steps &&) = default;
+ /** Allow instances of this class to be moved */
+ Steps &operator=(Steps &&) = default;
+ /** Default destructor */
+ ~Steps() = default;
+};
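+
+// Illustrative usage sketch: a kernel stepping over 8 elements along X per
+// iteration; unspecified dimensions default to 1.
+//
+//   Steps steps(8U); // equivalent to steps of [8, 1, 1, ...]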
+}
+#endif /*__ARM_COMPUTE_STEPS_H__*/
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
new file mode 100644
index 0000000000..329fafb5f8
--- /dev/null
+++ b/arm_compute/core/Strides.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_STRIDES_H__
+#define __ARM_COMPUTE_STRIDES_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Strides of an item in bytes */
+class Strides : public Dimensions<size_t>
+{
+public:
+ /** Constructor to initialize the strides.
+ *
+ * @param[in] strides Values to initialize the strides.
+ */
+ template <typename... Ts>
+ constexpr Strides(Ts... strides)
+ : Dimensions{ strides... }
+ {
+ }
+ /** Allow instances of this class to be copy constructed */
+ constexpr Strides(const Strides &) = default;
+ /** Allow instances of this class to be copied */
+ Strides &operator=(const Strides &) = default;
+ /** Allow instances of this class to be move constructed */
+ constexpr Strides(Strides &&) = default;
+ /** Allow instances of this class to be moved */
+ Strides &operator=(Strides &&) = default;
+ /** Default destructor */
+ ~Strides() = default;
+};
+}
+#endif /*__ARM_COMPUTE_STRIDES_H__*/
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
new file mode 100644
index 0000000000..e2532fd487
--- /dev/null
+++ b/arm_compute/core/SubTensorInfo.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SUBTENSORINFO_H__
+#define __ARM_COMPUTE_SUBTENSORINFO_H__
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+/** Store the sub tensor's metadata */
+class SubTensorInfo final : public ITensorInfo
+{
+public:
+ /** Default constructor */
+ SubTensorInfo();
+ /** Default constructor
+ *
+ * @param[in] parent Metadata of parent tensor.
+ * @param[in] tensor_shape Tensor shape. Shape must fit inside parent's shape.
+ * X and Y dimensions must match the parent's ones.
+ * @param[in] coords Coordinates of starting element inside parent tensor.
+ */
+ SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+ /** Default destructor */
+ ~SubTensorInfo() = default;
+ /** Allow instances of this class to be copy constructed */
+ SubTensorInfo(const SubTensorInfo &) = default;
+ /** Allow instances of this class to be copied */
+ SubTensorInfo &operator=(const SubTensorInfo &) = default;
+ /** Allow instances of this class to be move constructed */
+ SubTensorInfo(SubTensorInfo &&) = default;
+ /** Allow instances of this class to be moved */
+ SubTensorInfo &operator=(SubTensorInfo &&) = default;
+
+ // Inherited methods overridden:
+ void set_data_type(DataType data_type) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_data_type(data_type);
+ };
+ void set_num_channels(int num_channels) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_num_channels(num_channels);
+ };
+ void set_format(Format format) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_format(format);
+ };
+ void set_fixed_point_position(int fixed_point_position) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_fixed_point_position(fixed_point_position);
+ };
+ void set_tensor_shape(TensorShape shape) override;
+ bool auto_padding() override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->auto_padding();
+ };
+ bool extend_padding(const PaddingSize &padding) override;
+ size_t dimension(size_t index) const override
+ {
+ return _tensor_shape[index];
+ }
+ const Strides &strides_in_bytes() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->strides_in_bytes();
+ }
+ size_t offset_first_element_in_bytes() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->offset_element_in_bytes(_coords);
+ }
+ size_t offset_element_in_bytes(const Coordinates &pos) const override;
+ int fixed_point_position() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->fixed_point_position();
+ }
+ size_t element_size() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->element_size();
+ }
+ size_t num_dimensions() const override
+ {
+ return _tensor_shape.num_dimensions();
+ }
+ size_t num_channels() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->num_channels();
+ }
+ const TensorShape &tensor_shape() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _tensor_shape;
+ }
+ DataType data_type() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->data_type();
+ }
+ Format format() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->format();
+ }
+ size_t total_size() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->total_size();
+ }
+ PaddingSize padding() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->padding();
+ }
+ bool has_padding() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->has_padding();
+ }
+ bool is_resizable() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->is_resizable();
+ }
+ void set_is_resizable(bool is_resizable) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_is_resizable(is_resizable);
+ }
+ ValidRegion valid_region() const override
+ {
+ return _valid_region;
+ }
+ void set_valid_region(ValidRegion valid_region) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
+ _valid_region = std::move(valid_region);
+ }
+
+private:
+ ITensorInfo *_parent;
+ TensorShape _tensor_shape;
+ Coordinates _coords;
+ ValidRegion _valid_region;
+};
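+
+// Illustrative usage sketch (names are assumptions): view the second slice of
+// a two-slice parent tensor; the X and Y dimensions match the parent's, as
+// required.
+//
+//   TensorInfo    parent(TensorShape(64U, 64U, 2U), Format::U8);
+//   SubTensorInfo slice(&parent, TensorShape(64U, 64U, 1U), Coordinates(0, 0, 1));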
+}
+#endif /*__ARM_COMPUTE_SUBTENSORINFO_H__ */
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
new file mode 100644
index 0000000000..35b9ccb9ff
--- /dev/null
+++ b/arm_compute/core/TensorInfo.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSORINFO_H__
+#define __ARM_COMPUTE_TENSORINFO_H__
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+class HOGInfo;
+
+/** Store the tensor's metadata */
+class TensorInfo final : public ITensorInfo
+{
+public:
+ /** Default constructor */
+ TensorInfo();
+ /** Default destructor */
+ ~TensorInfo() = default;
+ /** Construct a tensor info from the metadata of another ITensorInfo */
+ TensorInfo(const ITensorInfo &info);
+ /** Allow instances of this class to be copy constructed */
+ TensorInfo(const TensorInfo &) = default;
+ /** Allow instances of this class to be copied */
+ TensorInfo &operator=(const TensorInfo &) = default;
+ /** Allow instances of this class to be move constructed */
+ TensorInfo(TensorInfo &&) = default;
+ /** Allow instances of this class to be moved */
+ TensorInfo &operator=(TensorInfo &&) = default;
+
+ /** Construct a tensor info with a format.
+ *
+ * Can be used for automatic derivation of the shape by the function.
+ *
+ * @param[in] format Format of the tensor.
+ */
+ TensorInfo(Format format);
+
+ /** 2D tensor constructor
+ *
+ * @param[in] width Width of the 2D tensor
+ * @param[in] height Height of the 2D tensor
+ * @param[in] format Single plane format of the tensor.
+ */
+ TensorInfo(unsigned int width, unsigned int height, Format format);
+ /** Constructor
+ *
+ * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements.
+ * @param[in] format Single plane format of the tensor.
+ */
+ TensorInfo(const TensorShape &tensor_shape, Format format);
+
+ /** Construct a tensor info with a data type and number of channels.
+ *
+ * Can be used for automatic derivation of the shape by the function.
+ *
+ * @param[in] num_channels It indicates the number of channels for each tensor element
+ * @param[in] data_type Data type to use for each tensor element
+ * @param[in] fixed_point_position (Optional) It specifies the fixed point position when the tensor data type is QS8, QS16 or QS32.
+ */
+ TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+
+ /** Constructor
+ *
+ * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements.
+ * @param[in] num_channels It indicates the number of channels for each tensor element
+ * @param[in] data_type Data type to use for each tensor element
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ */
+ TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+ /** Constructor
+ *
+ * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
+ * @param[in] width Width of the 2D tensor where the HOG descriptor will be computed on
+ * @param[in] height Height of the 2D tensor where the HOG descriptor will be computed on
+ */
+ TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+
+ /** Initialize the tensor info with just a format.
+ *
+ * Can be used for automatic derivation of the shape by the function.
+ *
+ * @param[in] format Single plane format of the tensor.
+ */
+ void init(Format format);
+
+ /** Initialize the metadata structure with the given parameters
+ *
+ * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
+ * @param[in] format Single plane format of the tensor.
+ */
+ void init(const TensorShape &tensor_shape, Format format);
+ /** Initialize the metadata structure with the given parameters
+ *
+ * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
+ * @param[in] format Single plane format of the tensor.
+ * @param[in] strides_in_bytes Stride in bytes for accessing each dimension of the tensor.
+ * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
+ * @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element).
+ */
+ void init(const TensorShape &tensor_shape, Format format, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, size_t total_size_in_bytes);
+
+ /** Initialize the tensor info with just a format.
+ *
+ * Can be used for automatic derivation of the shape by the function.
+ *
+ * @param[in] num_channels Desired number of channels for each tensor element.
+ * @param[in] data_type Data type to use for each tensor element.
+ * @param[in] fixed_point_position (Optional) Fixed point position when the tensor data type is QS8, QS16 or QS32.
+ */
+ void init(size_t num_channels, DataType data_type, size_t fixed_point_position = 0);
+
+ /** Initialize the metadata structure with the given parameters
+ *
+ * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
+ * @param[in] num_channels Desired number of channels for each tensor element.
+ * @param[in] data_type Data type to use for each tensor element.
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ */
+ void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+ /** Initialize the metadata structure with the given parameters
+ *
+ * @param[in] tensor_shape Size for each dimension of the tensor in number of elements.
+ * @param[in] num_channels Desired number of channels for each tensor element.
+ * @param[in] data_type Data type to use for each tensor element.
+ * @param[in] strides_in_bytes Stride in bytes for accessing each dimension of the tensor.
+ * @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
+ * @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element).
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ */
+ void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes, int fixed_point_position = 0);
+ /** Initialize the metadata structure for the given HOG's metadata
+ *
+ * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
+ * @param[in] width Width of the 2D tensor where the HOG descriptor will be computed on
+ * @param[in] height Height of the 2D tensor where the HOG descriptor will be computed on
+ */
+ void init(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+ /** Initialize the metadata structure for the given tensor shape and single-plane format (padding is automatically calculated)
+ *
+ * @note The padding used by this method is really conservative so that the tensor can be used for most functions.
+ *
+ * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements
+ * @param[in] format Single plane format of the image.
+ *
+ * @return Total allocation size including padding in bytes.
+ */
+ size_t init_auto_padding(const TensorShape &tensor_shape, Format format);
+ /** Initialize the metadata structure for the given tensor shape, number of channels,
+ * data type and fixed point position. (Padding is automatically calculated)
+ *
+ * @note The padding used by this method is really conservative so that the tensor can be used for most functions.
+ *
+ * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements
+ * @param[in] num_channels It indicates the number of channels for each tensor element
+ * @param[in] data_type Data type to use for each tensor element
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ *
+ * @return Total allocation size including padding in bytes.
+ */
+ size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0);
+ /** Initialize the metadata structure for the given HOG's metadata
+ *
+ * @note init_auto_padding will be used for the tensor initialization.
+ *
+ * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
+ * @param[in] width Width of the 2D tensor where the HOG descriptor will be computed on
+ * @param[in] height Height of the 2D tensor where the HOG descriptor will be computed on
+ */
+ size_t init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+
+ // Inherited methods overridden:
+ void set_data_type(DataType data_type) override;
+ void set_num_channels(int num_channels) override;
+ void set_format(Format format) override;
+ void set_tensor_shape(TensorShape shape) override;
+ void set_fixed_point_position(int fixed_point_position) override;
+ bool auto_padding() override;
+ bool extend_padding(const PaddingSize &padding) override;
+ size_t dimension(size_t index) const override
+ {
+ return _tensor_shape[index];
+ }
+ const Strides &strides_in_bytes() const override
+ {
+ return _strides_in_bytes;
+ }
+ size_t offset_first_element_in_bytes() const override
+ {
+ return _offset_first_element_in_bytes;
+ }
+ size_t offset_element_in_bytes(const Coordinates &pos) const override;
+ int fixed_point_position() const override
+ {
+ return _fixed_point_position;
+ }
+ size_t element_size() const override
+ {
+ return data_size_from_type(_data_type) * _num_channels;
+ }
+ size_t num_dimensions() const override
+ {
+ return _tensor_shape.num_dimensions();
+ }
+ size_t num_channels() const override
+ {
+ return _num_channels;
+ }
+ const TensorShape &tensor_shape() const override
+ {
+ return _tensor_shape;
+ }
+ DataType data_type() const override
+ {
+ return _data_type;
+ }
+ Format format() const override
+ {
+ return _format;
+ }
+ size_t total_size() const override
+ {
+ return _total_size;
+ }
+ PaddingSize padding() const override
+ {
+ return _padding;
+ }
+ bool has_padding() const override
+ {
+ return !_padding.empty();
+ }
+ bool is_resizable() const override
+ {
+ return _is_resizable;
+ }
+ void set_is_resizable(bool is_resizable) override
+ {
+ _is_resizable = is_resizable;
+ }
+ ValidRegion valid_region() const override
+ {
+ return _valid_region;
+ }
+ void set_valid_region(ValidRegion valid_region) override
+ {
+ _valid_region = std::move(valid_region);
+ }
+
+private:
+ /** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
+ *
+ * @param[in] padding Padding around the XY plane in elements.
+ */
+ std::tuple<Strides, size_t, size_t> calculate_padding_requirements(const PaddingSize &padding);
+
+ size_t _total_size;
+ int _fixed_point_position;
+ size_t _offset_first_element_in_bytes;
+ Strides _strides_in_bytes;
+ size_t _num_channels;
+ TensorShape _tensor_shape;
+ DataType _data_type;
+ Format _format;
+ bool _is_resizable;
+ ValidRegion _valid_region;
+ PaddingSize _padding;
+};
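+
+// Illustrative construction sketches (names, shapes and formats are assumptions):
+//
+//   TensorInfo image_info(TensorShape(640U, 480U), Format::U8);          // 2D U8 image
+//   TensorInfo fmap_info(TensorShape(13U, 13U, 256U), 1, DataType::F32); // F32 feature map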
+}
+#endif /*__ARM_COMPUTE_TENSORINFO_H__ */
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
new file mode 100644
index 0000000000..f8b3181686
--- /dev/null
+++ b/arm_compute/core/TensorShape.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSORSHAPE_H__
+#define __ARM_COMPUTE_TENSORSHAPE_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <numeric>
+
+namespace arm_compute
+{
+/** Shape of a tensor */
+class TensorShape : public Dimensions<size_t>
+{
+public:
+ /** Constructor to initialize the tensor shape.
+ *
+ * @param[in] dims Values to initialize the dimensions.
+ */
+ template <typename... Ts>
+ TensorShape(Ts... dims)
+ : Dimensions{ dims... }
+ {
+ // Initialize unspecified dimensions to 1
+ if(_num_dimensions > 0)
+ {
+ std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+ }
+
+ // Correct the number of dimensions to ignore trailing dimensions of size 1
+ apply_dimension_correction();
+ }
+ /** Allow instances of this class to be copy constructed */
+ TensorShape(const TensorShape &) = default;
+ /** Allow instances of this class to be copied */
+ TensorShape &operator=(const TensorShape &) = default;
+ /** Allow instances of this class to be move constructed */
+ TensorShape(TensorShape &&) = default;
+ /** Allow instances of this class to be moved */
+ TensorShape &operator=(TensorShape &&) = default;
+ /** Default destructor */
+ ~TensorShape() = default;
+
+ /** Accessor to set the value of one of the dimensions.
+ *
+ * @param[in] dimension Dimension for which the value is set.
+ * @param[in] value Value to be set for the dimension.
+ */
+ void set(size_t dimension, size_t value)
+ {
+ ARM_COMPUTE_ERROR_ON(value < 1);
+
+ // Make sure all empty dimensions are filled with 1
+ std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+
+ // Set the specified dimension and increase the number of dimensions if
+ // necessary
+ Dimensions::set(dimension, value);
+
+ // Correct the number of dimensions to ignore trailing dimensions of size 1
+ apply_dimension_correction();
+ }
+
+ /** Collapse @p n dimensions, starting from @p first, into a single dimension.
+ *
+ * @param[in] n Number of dimensions to collapse into @p first.
+ * @param[in] first Dimension into which the following @p n are collapsed. Defaults to 0.
+ */
+ void collapse(size_t n, size_t first = 0)
+ {
+ Dimensions::collapse(n, first);
+
+ // Make sure all empty dimensions are filled with 1
+ std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
+ }
+
+ /** Collapses all dimensions to a single linear total size.
+ *
+ * @return The total tensor size in terms of elements.
+ */
+ size_t total_size() const
+ {
+ return std::accumulate(_id.begin(), _id.end(), static_cast<size_t>(1), std::multiplies<size_t>());
+ }
+ /** Collapses given dimension and above.
+ *
+ * @note Precondition: dimension < TensorShape::num_max_dimensions
+ *
+ * @param[in] dimension Index of the first dimension to collapse
+ *
+ * @return The linear size of the collapsed dimensions
+ */
+ size_t total_size_upper(size_t dimension) const
+ {
+ return std::accumulate(_id.begin() + dimension, _id.end(), static_cast<size_t>(1), std::multiplies<size_t>());
+ }
+
+private:
+ /** Remove trailing dimensions of size 1 from the reported number of dimensions. */
+ void apply_dimension_correction()
+ {
+ for(int i = static_cast<int>(_num_dimensions) - 1; i >= 0; --i)
+ {
+ if(_id[i] == 1)
+ {
+ --_num_dimensions;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+};
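+
+// Usage sketch (illustrative only, not part of this header): dimensions are
+// passed innermost first and trailing dimensions of size 1 are ignored.
+//
+//   TensorShape shape(23U, 17U, 3U, 1U); // num_dimensions() reports 3
+//   shape.total_size();                  // 23 * 17 * 3 = 1173 elements
+//   shape.total_size_upper(1);           // 17 * 3 = 51 elements
+//   shape.collapse(2);                   // shape becomes (391, 3)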
+}
+#endif /*__ARM_COMPUTE_TENSORSHAPE_H__*/
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
new file mode 100644
index 0000000000..725567b9ae
--- /dev/null
+++ b/arm_compute/core/Types.h
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TYPES_H__
+#define __ARM_COMPUTE_TYPES_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+/** Image colour formats */
+enum class Format
+{
+ UNKNOWN, /**< Unknown image format */
+ U8, /**< 1 channel, 1 U8 per channel */
+ S16, /**< 1 channel, 1 S16 per channel */
+ U16, /**< 1 channel, 1 U16 per channel */
+ S32, /**< 1 channel, 1 S32 per channel */
+ U32, /**< 1 channel, 1 U32 per channel */
+ F16, /**< 1 channel, 1 F16 per channel */
+ F32, /**< 1 channel, 1 F32 per channel */
+ UV88, /**< 2 channel, 1 U8 per channel */
+ RGB888, /**< 3 channels, 1 U8 per channel */
+ RGBA8888, /**< 4 channels, 1 U8 per channel */
+ YUV444, /**< A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */
+ YUYV422, /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
+ NV12, /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
+ NV21, /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
+ IYUV, /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
+ UYVY422 /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 bytes */
+};
+
+/** Available data types */
+enum class DataType
+{
+ UNKNOWN, /**< Unknown data type */
+ U8, /**< Unsigned 8-bit number */
+ S8, /**< Signed 8-bit number */
+ QS8, /**< Fixed point 8-bit number */
+ U16, /**< Unsigned 16-bit number */
+ S16, /**< Signed 16-bit number */
+ QS16, /**< Fixed point 16-bit number */
+ U32, /**< Unsigned 32-bit number */
+ S32, /**< Signed 32-bit number */
+ U64, /**< Unsigned 64-bit number */
+ S64, /**< Signed 64-bit number */
+ F16, /**< 16-bit floating point number */
+ F32, /**< 32-bit floating point number */
+ F64, /**< 64-bit floating point number */
+ SIZET /**< size_t */
+};
+
+/** Constant value of the border pixels when using BorderMode::CONSTANT */
+constexpr uint8_t CONSTANT_BORDER_VALUE = 199;
+
+/** Constant value used to indicate a half-scale pyramid */
+constexpr float SCALE_PYRAMID_HALF = 0.5f;
+
+/** Constant value used to indicate an ORB scaled pyramid */
+constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01;
+
+/** Container for the valid region of a tensor */
+struct ValidRegion
+{
+ ValidRegion()
+ : anchor{}, shape{}
+ {
+ }
+
+ ValidRegion(const ValidRegion &) = default;
+ ValidRegion(ValidRegion &&) = default;
+ ValidRegion &operator=(const ValidRegion &) = default;
+ ValidRegion &operator=(ValidRegion &&) = default;
+ ~ValidRegion() = default;
+
+ ValidRegion(Coordinates anchor, TensorShape shape)
+ : anchor{ anchor }, shape{ shape }
+ {
+ }
+
+ /** Return the start of the valid region for the given dimension @p d */
+ int start(unsigned int d) const
+ {
+ return anchor[d];
+ }
+
+ /** Return the end of the valid region for the given dimension @p d */
+ int end(unsigned int d) const
+ {
+ return anchor[d] + shape[d];
+ }
+
+ Coordinates anchor;
+ TensorShape shape;
+};
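+
+// Usage sketch (illustrative only): a region anchored at (1, 1) covering a
+// 5x5 block ends at coordinate 6 in each of the two dimensions.
+//
+//   ValidRegion region(Coordinates(1, 1), TensorShape(5U, 5U));
+//   region.start(0); // 1
+//   region.end(0);   // 1 + 5 = 6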
+
+/** Methods available to handle borders */
+enum class BorderMode
+{
+ UNDEFINED, /**< Borders are left undefined */
+ CONSTANT, /**< Pixels outside the image are assumed to have a constant value */
+ REPLICATE /**< Pixels outside the image are assumed to have the same value as the closest image pixel */
+};
+
+/** Container for 2D border size */
+struct BorderSize
+{
+ /** Empty border, i.e. no border */
+ constexpr BorderSize()
+ : top{ 0 }, right{ 0 }, bottom{ 0 }, left{ 0 }
+ {
+ }
+
+ /** Border with equal size around the 2D plane */
+ constexpr BorderSize(unsigned int size)
+ : top{ size }, right{ size }, bottom{ size }, left{ size }
+ {
+ }
+
+ /** Border with same size for top/bottom and left/right */
+ constexpr BorderSize(unsigned int top_bottom, unsigned int left_right)
+ : top{ top_bottom }, right{ left_right }, bottom{ top_bottom }, left{ left_right }
+ {
+ }
+
+ /** Border with different sizes */
+ constexpr BorderSize(unsigned int top, unsigned int right, unsigned int bottom, unsigned int left)
+ : top{ top }, right{ right }, bottom{ bottom }, left{ left }
+ {
+ }
+
+ /** Check if the entire border is zero */
+ constexpr bool empty() const
+ {
+ return top == 0 && right == 0 && bottom == 0 && left == 0;
+ }
+
+ /** Check if the border is the same size on all sides */
+ constexpr bool uniform() const
+ {
+ return top == right && top == bottom && top == left;
+ }
+
+ BorderSize &operator*=(float scale)
+ {
+ top *= scale;
+ right *= scale;
+ bottom *= scale;
+ left *= scale;
+
+ return *this;
+ }
+
+ BorderSize operator*(float scale)
+ {
+ BorderSize size = *this;
+ size *= scale;
+
+ return size;
+ }
+
+ void limit(const BorderSize &limit)
+ {
+ top = std::min(top, limit.top);
+ right = std::min(right, limit.right);
+ bottom = std::min(bottom, limit.bottom);
+ left = std::min(left, limit.left);
+ }
+
+ unsigned int top;
+ unsigned int right;
+ unsigned int bottom;
+ unsigned int left;
+};
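+
+// Usage sketch (illustrative only): a uniform two-pixel border clamped
+// against a smaller top/bottom limit.
+//
+//   BorderSize border(2);           // top = right = bottom = left = 2
+//   border.limit(BorderSize(1, 2)); // top/bottom become 1, left/right stay 2
+//   border.uniform();               // false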
+
+using PaddingSize = BorderSize;
+
+/** Policy to handle overflow */
+enum class ConvertPolicy
+{
+ WRAP, /**< Wrap around */
+ SATURATE /**< Saturate */
+};
+
+/** Interpolation method */
+enum class InterpolationPolicy
+{
+ NEAREST_NEIGHBOR, /**< Output values are defined to match the source pixel whose center is nearest to the sample position */
+ BILINEAR, /**< Output values are defined by bilinear interpolation between the pixels */
+ AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
+};
+
+/** Bilinear Interpolation method used by LKTracker */
+enum class BilinearInterpolation
+{
+ BILINEAR_OLD_NEW,
+ BILINEAR_SCHARR
+};
+
+/** Threshold mode */
+enum class ThresholdType
+{
+ BINARY, /**< Threshold with one value */
+ RANGE /**< Threshold with two values */
+};
+
+/** Rounding method */
+enum class RoundingPolicy
+{
+ TO_ZERO, /**< Truncates the least significant values that are lost in operations. */
+ TO_NEAREST_UP, /**< Rounds to nearest value; half rounds up */
+ TO_NEAREST_EVEN /**< Rounds to nearest value; half rounds to nearest even */
+};
+
+/** Termination criteria */
+enum class Termination
+{
+ TERM_CRITERIA_EPSILON,
+ TERM_CRITERIA_ITERATIONS,
+ TERM_CRITERIA_BOTH
+};
+
+/** Magnitude calculation type. */
+enum class MagnitudeType
+{
+ L1NORM, /**< L1 normalization type */
+ L2NORM /**< L2 normalization type */
+};
+
+/** Phase calculation type.
+ *
+ * @note When PhaseType == SIGNED, each angle in the range [0, 360] is mapped to the range 0 to 255 inclusive; otherwise angles in the range [0, 180] are mapped
+ */
+enum class PhaseType
+{
+ SIGNED, /**< Angle range: [0, 360] */
+ UNSIGNED /**< Angle range: [0, 180] */
+};
+
+/** Keypoint type */
+struct KeyPoint
+{
+ int32_t x{ 0 }; /**< X coordinates */
+ int32_t y{ 0 }; /**< Y coordinates */
+ float strength{ 0.f }; /**< Strength of the point */
+ float scale{ 0.f }; /**< Scale initialized to 0 by the corner detector */
+ float orientation{ 0.f }; /**< Orientation initialized to 0 by the corner detector */
+ int32_t tracking_status{ 0 }; /**< Status initialized to 1 by the corner detector, set to 0 when the point is lost */
+ float error{ 0.f }; /**< Tracking error initialized to 0 by the corner detector */
+};
+
+using InternalKeypoint = std::tuple<float, float, float>; /* x,y,strength */
+
+/** Rectangle type */
+struct Rectangle
+{
+ uint16_t x; /**< Top-left x coordinate */
+ uint16_t y; /**< Top-left y coordinate */
+ uint16_t width; /**< Width of the rectangle */
+ uint16_t height; /**< Height of the rectangle */
+};
+
+/** Coordinate type */
+struct Coordinates2D
+{
+ int32_t x; /**< X coordinates */
+ int32_t y; /**< Y coordinates */
+};
+
+/** Coordinate type */
+struct Coordinates3D
+{
+ uint32_t x; /**< X coordinates */
+ uint32_t y; /**< Y coordinates */
+ uint32_t z; /**< Z coordinates */
+};
+
+/** Available channels */
+enum class Channel
+{
+ UNKNOWN, /**< Unknown channel format */
+ C0, /**< First channel (used by formats with unknown channel types). */
+ C1, /**< Second channel (used by formats with unknown channel types). */
+ C2, /**< Third channel (used by formats with unknown channel types). */
+ C3, /**< Fourth channel (used by formats with unknown channel types). */
+ R, /**< Red channel. */
+ G, /**< Green channel. */
+ B, /**< Blue channel. */
+ A, /**< Alpha channel. */
+ Y, /**< Luma channel. */
+ U, /**< Cb/U channel. */
+ V /**< Cr/V/Value channel. */
+};
+
+/** Available matrix patterns */
+enum class MatrixPattern
+{
+ BOX, /**< Box pattern matrix. */
+ CROSS, /**< Cross pattern matrix. */
+ DISK, /**< Disk pattern matrix. */
+ OTHER /**< Any other matrix pattern. */
+};
+
+/** Available non linear functions. */
+enum class NonLinearFilterFunction : unsigned
+{
+ MEDIAN = 0, /**< Non linear median filter. */
+ MIN = 1, /**< Non linear erode. */
+ MAX = 2, /**< Non linear dilate. */
+};
+
+/** The normalization type used for the normalization layer */
+enum class NormType
+{
+ IN_MAP_1D, /**< Normalization applied within the same map in 1D region */
+ IN_MAP_2D, /**< Normalization applied within the same map in 2D region */
+ CROSS_MAP /**< Normalization applied cross maps */
+};
+
+/** Normalization type for Histogram of Oriented Gradients (HOG) */
+enum class HOGNormType
+{
+ L2_NORM = 1, /**< L2-norm */
+ L2HYS_NORM = 2, /**< L2-norm followed by clipping */
+ L1_NORM = 3 /**< L1 norm */
+};
+
+/** Detection window used for the object detection. The detection window keeps the following information:
+ *
+ * -# Geometry of the rectangular window (x/y of top-left corner and width/height)
+ * -# Index of the class used for evaluating which class the detection window belongs to
+ * -# Confidence value (score) obtained with the classifier
+ */
+struct DetectionWindow
+{
+ uint16_t x{ 0 }; /**< Top-left x coordinate */
+ uint16_t y{ 0 }; /**< Top-left y coordinate */
+ uint16_t width{ 0 }; /**< Width of the detection window */
+ uint16_t height{ 0 }; /**< Height of the detection window */
+ uint16_t idx_class{ 0 }; /**< Index of the class */
+ float score{ 0.f }; /**< Confidence value for the detection window */
+};
+
+/** Dimension rounding type when down-scaling on CNNs
+ * @note Used in pooling and convolution layer
+ */
+enum class DimensionRoundingType
+{
+ FLOOR, /**< Floor rounding */
+ CEIL /**< Ceil rounding */
+};
+
+/** Available pooling types */
+enum class PoolingType
+{
+ MAX, /**< Max Pooling */
+ AVG /**< Average Pooling */
+};
+
+/** Padding and stride information class */
+class PadStrideInfo
+{
+public:
+ /** Constructor
+ *
+ * @param[in] stride_x (Optional) Stride, in elements, across x. Defaults to 1.
+ * @param[in] stride_y (Optional) Stride, in elements, across y. Defaults to 1.
+ * @param[in] pad_x (Optional) Padding, in elements, across x. Defaults to 0.
+ * @param[in] pad_y (Optional) Padding, in elements, across y. Defaults to 0.
+ * @param[in] round (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR.
+ */
+ PadStrideInfo(unsigned int stride_x = 1, unsigned int stride_y = 1,
+ unsigned int pad_x = 0, unsigned int pad_y = 0,
+ DimensionRoundingType round = DimensionRoundingType::FLOOR)
+ : _stride(std::make_pair(stride_x, stride_y)),
+ _pad(std::make_pair(pad_x, pad_y)),
+ _round_type(round)
+ {
+ }
+ std::pair<unsigned int, unsigned int> stride() const
+ {
+ return _stride;
+ }
+ std::pair<unsigned int, unsigned int> pad() const
+ {
+ return _pad;
+ }
+ DimensionRoundingType round() const
+ {
+ return _round_type;
+ }
+
+private:
+ std::pair<unsigned int, unsigned int> _stride;
+ std::pair<unsigned int, unsigned int> _pad;
+ DimensionRoundingType _round_type;
+};
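+
+// Usage sketch (illustrative only): stride 2 with one element of padding in
+// each dimension, rounding down whenever the stride does not divide exactly.
+//
+//   PadStrideInfo conv_info(2, 2, 1, 1, DimensionRoundingType::FLOOR);
+//   conv_info.stride(); // { 2, 2 }
+//   conv_info.pad();    // { 1, 1 }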
+
+/** Pooling Layer Information class */
+class PoolingLayerInfo
+{
+public:
+ /** Default Constructor
+ *
+ * @param[in] pool_type (Optional) Pooling type @ref PoolingType. Defaults to @ref PoolingType::MAX
+ * @param[in] pool_size (Optional) Pooling size, in elements, across x and y. Defaults to 2.
+ * @param[in] pad_stride_info (Optional) Padding and stride information @ref PadStrideInfo
+ */
+ PoolingLayerInfo(PoolingType pool_type = PoolingType::MAX, unsigned int pool_size = 2, PadStrideInfo pad_stride_info = PadStrideInfo())
+ : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info)
+ {
+ }
+ PoolingType pool_type() const
+ {
+ return _pool_type;
+ }
+ unsigned int pool_size() const
+ {
+ return _pool_size;
+ }
+ PadStrideInfo pad_stride_info() const
+ {
+ return _pad_stride_info;
+ }
+
+private:
+ PoolingType _pool_type;
+ unsigned int _pool_size;
+ PadStrideInfo _pad_stride_info;
+};
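+
+// Usage sketch (illustrative only): 3x3 average pooling with stride 2 and no
+// padding.
+//
+//   PoolingLayerInfo pool_info(PoolingType::AVG, 3, PadStrideInfo(2, 2));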
+
+/** Activation Layer Information class */
+class ActivationLayerInfo
+{
+public:
+ /** Available activation functions */
+ enum class ActivationFunction
+ {
+ LOGISTIC, /**< Logistic */
+ TANH, /**< Hyperbolic tangent */
+ RELU, /**< Rectifier */
+ BOUNDED_RELU, /**< Bounded Rectifier */
+ SOFT_RELU, /**< Soft Rectifier */
+ ABS, /**< Absolute */
+ SQUARE, /**< Square */
+ SQRT, /**< Square root */
+ LINEAR /**< Linear */
+ };
+
+ /** Constructor
+ *
+ * @param[in] f The activation function to use.
+ * @param[in] a (Optional) The alpha parameter used by some activation functions
+ * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
+ * @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
+ */
+ ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f)
+ : _act(f), _a(a), _b(b)
+ {
+ }
+ ActivationFunction activation() const
+ {
+ return _act;
+ }
+ float a() const
+ {
+ return _a;
+ }
+ float b() const
+ {
+ return _b;
+ }
+
+private:
+ ActivationFunction _act;
+ float _a;
+ float _b;
+};
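+
+// Usage sketch (illustrative only): a bounded rectifier with alpha = 6, the
+// common "ReLU6" configuration; beta is left at its default as BOUNDED_RELU
+// does not use it.
+//
+//   ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f);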
+
+/** Normalization Layer Information class */
+class NormalizationLayerInfo
+{
+public:
+ /** Constructor
+ *
+ * @param[in] type The normalization type. Can be @ref NormType::IN_MAP_1D, @ref NormType::IN_MAP_2D or @ref NormType::CROSS_MAP
+ * @param[in] norm_size The normalization size is the number of elements to normalize across. Defaults to 5.
+ * @param[in] alpha Alpha parameter used by normalization equation. Defaults to 0.0001.
+ * @param[in] beta Beta parameter used by normalization equation. Defaults to 0.5.
+ * @param[in] kappa Kappa parameter used by [Krizhevsky 2012] Across Channel Local Brightness Normalization equation. Defaults to 1.
+ */
+ NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f)
+ : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa)
+ {
+ }
+ NormType type() const
+ {
+ return _type;
+ }
+ uint32_t norm_size() const
+ {
+ return _norm_size;
+ }
+ float alpha() const
+ {
+ return _alpha;
+ }
+ float beta() const
+ {
+ return _beta;
+ }
+ float kappa() const
+ {
+ return _kappa;
+ }
+ /** Return the scaling factor of the normalization function. If kappa is not
+ * 1 then [Krizhevsky 2012] normalization scaling is specified. Scaling
+ * factor takes into account the total number of elements used for the
+ * normalization, so in case of 2 dimensions this is _norm_size^2.
+ *
+ * @return The normalization scaling factor.
+ */
+ float scale_coeff() const
+ {
+ const uint32_t size = (_type == NormType::IN_MAP_2D) ? _norm_size * _norm_size : _norm_size;
+ return (_kappa == 1.f) ? (_alpha / size) : _alpha;
+ }
+
+private:
+ NormType _type;
+ uint32_t _norm_size;
+ float _alpha;
+ float _beta;
+ float _kappa;
+};
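+
+// Worked example for scale_coeff() (illustrative only): with the defaults
+// (norm_size = 5, alpha = 0.0001, kappa = 1), IN_MAP_2D normalizes over
+// 5 * 5 = 25 elements, so the factor is 0.0001 / 25 = 4e-06; any kappa != 1
+// selects the [Krizhevsky 2012] variant and the factor is alpha itself.
+//
+//   NormalizationLayerInfo(NormType::IN_MAP_2D).scale_coeff(); // 4e-06f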
+
+/** Convolution Layer Weights Information class */
+class WeightsInfo
+{
+public:
+ /** Default constructor: the weights are not reshaped and the kernel size is 0. */
+ WeightsInfo()
+ : _are_reshaped(false), _kernel_size(0)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] are_reshaped True if the weights have been reshaped
+ * @param[in] kernel_size The size of the kernel.
+ */
+ WeightsInfo(bool are_reshaped, unsigned int kernel_size)
+ : _are_reshaped(are_reshaped), _kernel_size(kernel_size)
+ {
+ }
+
+ bool are_reshaped() const
+ {
+ return _are_reshaped;
+ }
+ unsigned int kernel_size() const
+ {
+ return _kernel_size;
+ }
+
+private:
+ const bool _are_reshaped;
+ const unsigned int _kernel_size;
+};
+
+/** IO formatting information class */
+struct IOFormatInfo
+{
+ /** Precision type used when printing floating point numbers */
+ enum class PrecisionType
+ {
+ Default, /**< Default precision to the one that the current stream has */
+ Custom, /**< Custom precision specified by the user using the precision parameter */
+ Full /**< The maximum precision of the floating point representation */
+ };
+
+ /** Specifies the area to be printed, used by Tensor objects */
+ enum class PrintRegion
+ {
+ ValidRegion, /**< Prints the valid region of the Tensor object */
+ NoPadding, /**< Prints the Tensor object without the padding */
+ Full /**< Print the tensor object including padding */
+ };
+
+ IOFormatInfo(PrintRegion print_region = PrintRegion::ValidRegion,
+ PrecisionType precision_type = PrecisionType::Default,
+ unsigned int precision = 10,
+ bool align_columns = true,
+ std::string element_delim = " ",
+ std::string row_delim = "\n")
+ : print_region(print_region),
+ precision_type(precision_type),
+ precision(precision),
+ element_delim(element_delim),
+ row_delim(row_delim),
+ align_columns(align_columns)
+ {
+ }
+
+ PrintRegion print_region;
+ PrecisionType precision_type;
+ unsigned int precision;
+ std::string element_delim;
+ std::string row_delim;
+ bool align_columns;
+};
+}
+#endif /* __ARM_COMPUTE_TYPES_H__ */
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
new file mode 100644
index 0000000000..9d3ff0a1bd
--- /dev/null
+++ b/arm_compute/core/Utils.h
@@ -0,0 +1,740 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_UTILS_H__
+#define __ARM_COMPUTE_UTILS_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+namespace arm_compute
+{
+/** Computes the smallest number larger or equal to value that is a multiple of divisor. */
+template <typename S, typename T>
+inline auto ceil_to_multiple(S value, T divisor) -> decltype(((value + divisor - 1) / divisor) * divisor)
+{
+ ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+ return ((value + divisor - 1) / divisor) * divisor;
+}
+
+/** Computes the largest number smaller or equal to value that is a multiple of divisor. */
+template <typename S, typename T>
+inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor) * divisor)
+{
+ ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+ return (value / divisor) * divisor;
+}
+
+/** Calculate the rounded up quotient of val / m. */
+template <typename S, typename T>
+constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
+{
+ return (val + m - 1) / m;
+}
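+
+// Worked examples (illustrative only) for the three helpers above:
+//
+//   ceil_to_multiple(17, 8);  // ((17 + 7) / 8) * 8 = 24
+//   floor_to_multiple(17, 8); // (17 / 8) * 8       = 16
+//   DIV_CEIL(17, 8);          // (17 + 7) / 8       = 3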
+
+/** Returns the arm_compute library build information
+ *
+ * Contains the version number and the build options used to build the library
+ *
+ * @return The arm_compute library build information
+ */
+std::string build_information();
+
+/** Load an entire file in memory
+ *
+ * @param[in] filename Name of the file to read.
+ * @param[in] binary True if the file is binary, i.e. it should be read in binary mode.
+ *
+ * @return The content of the file.
+ */
+std::string read_file(const std::string &filename, bool binary);
+
+/** Return a value as a string
+ *
+ * @param[in] val Input value.
+ *
+ * @return Value represented as a string
+ */
+template <typename T>
+const std::string val_to_string(T val)
+{
+ return static_cast<const std::ostringstream &>(std::ostringstream() << val).str();
+}
+
+/** The size in bytes of the data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t data_size_from_type(DataType data_type)
+{
+ switch(data_type)
+ {
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QS8:
+ return 1;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::F16:
+ case DataType::QS16:
+ return 2;
+ case DataType::F32:
+ case DataType::U32:
+ case DataType::S32:
+ return 4;
+ case DataType::F64:
+ case DataType::U64:
+ case DataType::S64:
+ return 8;
+ case DataType::SIZET:
+ return sizeof(size_t);
+ default:
+ ARM_COMPUTE_ERROR("Invalid data type");
+ return 0;
+ }
+}
+
+/** The size in bytes of the pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The size in bytes of the pixel format
+ */
+inline size_t pixel_size_from_format(Format format)
+{
+ switch(format)
+ {
+ case Format::U8:
+ return 1;
+ case Format::U16:
+ case Format::S16:
+ case Format::F16:
+ case Format::UV88:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return 2;
+ case Format::RGB888:
+ return 3;
+ case Format::RGBA8888:
+ return 4;
+ case Format::U32:
+ case Format::S32:
+ case Format::F32:
+ return 4;
+ //Doesn't make sense for planar formats:
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ case Format::YUV444:
+ default:
+ ARM_COMPUTE_ERROR("Undefined pixel size for given format");
+ return 0;
+ }
+}
+
+/** The size in bytes of the data type
+ *
+ * @param[in] dt Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t element_size_from_data_type(DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::S8:
+ case DataType::U8:
+ case DataType::QS8:
+ return 1;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::QS16:
+ case DataType::F16:
+ return 2;
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ return 4;
+ default:
+ ARM_COMPUTE_ERROR("Undefined element size for given data type");
+ return 0;
+ }
+}
+
+/** Return the data type used by a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The data type used by the given single-planar pixel format
+ */
+inline DataType data_type_from_format(Format format)
+{
+ switch(format)
+ {
+ case Format::U8:
+ case Format::UV88:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return DataType::U8;
+ case Format::U16:
+ return DataType::U16;
+ case Format::S16:
+ return DataType::S16;
+ case Format::U32:
+ return DataType::U32;
+ case Format::S32:
+ return DataType::S32;
+ case Format::F16:
+ return DataType::F16;
+ case Format::F32:
+ return DataType::F32;
+ //Doesn't make sense for planar formats:
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ case Format::YUV444:
+ default:
+ ARM_COMPUTE_ERROR("Not supported data_type for given format");
+ return DataType::UNKNOWN;
+ }
+}
+
+/** Return the plane index of a given channel given an input format.
+ *
+ * @param[in] format Input format
+ * @param[in] channel Input channel
+ *
+ * @return The plane index of the specific channel of the specific format
+ */
+inline int plane_idx_from_channel(Format format, Channel channel)
+{
+ switch(format)
+ {
+ case Format::NV12:
+ case Format::NV21:
+ {
+ switch(channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ case Channel::V:
+ return 1;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::IYUV:
+ case Format::YUV444:
+ {
+ switch(channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ return 1;
+ case Channel::V:
+ return 2;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported format");
+ return 0;
+ }
+}
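+
+// Worked example (illustrative only): NV12 stores the interleaved chroma
+// pair in its second plane, while IYUV keeps U and V in separate planes.
+//
+//   plane_idx_from_channel(Format::NV12, Channel::V); // 1
+//   plane_idx_from_channel(Format::IYUV, Channel::V); // 2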
+
+/** Return the number of planes for a given format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of planes for a given image format.
+ */
+inline size_t num_planes_from_format(Format format)
+{
+ switch(format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::U32:
+ case Format::F16:
+ case Format::F32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return 1;
+ case Format::NV12:
+ case Format::NV21:
+ return 2;
+ case Format::IYUV:
+ case Format::YUV444:
+ return 3;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format");
+ return 0;
+ }
+}
+
+/** Return the number of channels for a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of channels for a given image format.
+ */
+inline size_t num_channels_from_format(Format format)
+{
+ switch(format)
+ {
+ case Format::U8:
+ case Format::U16:
+ case Format::S16:
+ case Format::U32:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ return 1;
+ // Because the U and V channels are subsampled
+ // these formats appear to have only 2 channels:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return 2;
+ case Format::UV88:
+ return 2;
+ case Format::RGB888:
+ return 3;
+ case Format::RGBA8888:
+ return 4;
+ //Doesn't make sense for planar formats:
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ case Format::YUV444:
+ default:
+ return 0;
+ }
+}
+
+/** Separate a 2D convolution into two 1D convolutions
+*
+* @param[in] conv 2D convolution
+* @param[out] conv_col 1D vertical convolution
+* @param[out] conv_row 1D horizontal convolution
+* @param[in] size Size of the 2D convolution
+*
+* @return true if the separation was successful
+*/
+inline bool separate_matrix(const int16_t *conv, int16_t *conv_col, int16_t *conv_row, uint8_t size)
+{
+ int32_t min_col = -1;
+ int16_t min_col_val = -1;
+
+ for(int32_t i = 0; i < size; ++i)
+ {
+ if(conv[i] != 0 && (min_col < 0 || abs(min_col_val) > abs(conv[i])))
+ {
+ min_col = i;
+ min_col_val = conv[i];
+ }
+ }
+
+ if(min_col < 0)
+ {
+ return false;
+ }
+
+ for(uint32_t j = 0; j < size; ++j)
+ {
+ conv_col[j] = conv[min_col + j * size];
+ }
+
+ for(uint32_t i = 0; i < size; i++)
+ {
+ if(static_cast<int>(i) == min_col)
+ {
+ conv_row[i] = 1;
+ }
+ else
+ {
+ int16_t coeff = conv[i] / conv[min_col];
+
+ for(uint32_t j = 1; j < size; ++j)
+ {
+ if(conv[i + j * size] != (conv_col[j] * coeff))
+ {
+ return false;
+ }
+ }
+
+ conv_row[i] = coeff;
+ }
+ }
+
+ return true;
+}
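+
+// Worked example (illustrative only): the 3x3 binomial smoothing kernel is
+// separable into two identical 1D vectors.
+//
+//   // | 1 2 1 |
+//   // | 2 4 2 |  ==  { 1, 2, 1 } (column) x { 1, 2, 1 } (row)
+//   // | 1 2 1 |
+//   int16_t conv[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
+//   int16_t conv_col[3], conv_row[3];
+//   separate_matrix(conv, conv_col, conv_row, 3); // returns true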
+
+/** Calculate the scale of the given square matrix
+ *
+ * The scale is the absolute value of the sum of all the coefficients in the matrix.
+ *
+ * @note If the coefficients add up to 0 then the scale is set to 1.
+ *
+ * @param[in] matrix Matrix coefficients
+ * @param[in] matrix_size Number of elements per side of the square matrix. (Number of coefficients = matrix_size * matrix_size).
+ *
+ * @return The absolute value of the sum of the coefficients if they don't add up to 0, otherwise 1.
+ */
+inline uint32_t calculate_matrix_scale(const int16_t *matrix, unsigned int matrix_size)
+{
+ const size_t size = matrix_size * matrix_size;
+
+ return std::max(1, std::abs(std::accumulate(matrix, matrix + size, 0)));
+}
+
+/** Calculate the accuracy required by the horizontal and vertical convolution computations
+ *
+ * @param[in] conv_col Pointer to the vertical vector of the separated convolution filter
+ * @param[in] conv_row Pointer to the horizontal vector of the convolution filter
+ * @param[in] size Number of elements per vector of the separated matrix
+ *
+ * @return The return type is a pair. The first element of the pair is the biggest data type needed for the first stage. The second
+ * element of the pair is the biggest data type needed for the second stage.
+ */
+inline std::pair<DataType, DataType> data_type_for_convolution(const int16_t *conv_col, const int16_t *conv_row, size_t size)
+{
+ DataType first_stage = DataType::UNKNOWN;
+ DataType second_stage = DataType::UNKNOWN;
+
+ auto gez = [](const int16_t &v)
+ {
+ return v >= 0;
+ };
+
+ auto accu_neg = [](const int &first, const int &second)
+ {
+ return first + (second < 0 ? second : 0);
+ };
+
+ auto accu_pos = [](const int &first, const int &second)
+ {
+ return first + (second > 0 ? second : 0);
+ };
+
+ const bool only_positive_coefficients = std::all_of(conv_row, conv_row + size, gez) && std::all_of(conv_col, conv_col + size, gez);
+
+ if(only_positive_coefficients)
+ {
+ const int max_row_value = std::accumulate(conv_row, conv_row + size, 0) * UINT8_MAX;
+ const int max_value = std::accumulate(conv_col, conv_col + size, 0) * max_row_value;
+
+ first_stage = (max_row_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
+
+ second_stage = (max_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
+ }
+ else
+ {
+ const int min_row_value = std::accumulate(conv_row, conv_row + size, 0, accu_neg) * UINT8_MAX;
+ const int max_row_value = std::accumulate(conv_row, conv_row + size, 0, accu_pos) * UINT8_MAX;
+ const int neg_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_neg);
+ const int pos_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_pos);
+ const int min_value = neg_coeffs_sum * max_row_value + pos_coeffs_sum * min_row_value;
+ const int max_value = neg_coeffs_sum * min_row_value + pos_coeffs_sum * max_row_value;
+
+ first_stage = ((INT16_MIN <= min_row_value) && (max_row_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
+
+ second_stage = ((INT16_MIN <= min_value) && (max_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
+ }
+
+ return std::make_pair(first_stage, second_stage);
+}
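+
+// Worked example (illustrative only): for the separated binomial kernel
+// above all coefficients are positive, so max_row_value = (1 + 2 + 1) * 255
+// = 1020 and max_value = 4 * 1020 = 4080. Both fit in UINT16_MAX, hence the
+// pair { U16, U16 } is returned.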
+
+/** Calculate the accuracy required by the square convolution matrix calculation.
+ *
+ * @param[in] conv Pointer to the squared convolution matrix
+ * @param[in] size The total size of the convolution matrix
+ *
+ * @return The return is the biggest data type needed to do the convolution
+ */
+inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t size)
+{
+ auto gez = [](const int16_t v)
+ {
+ return v >= 0;
+ };
+
+ const bool only_positive_coefficients = std::all_of(conv, conv + size, gez);
+
+ if(only_positive_coefficients)
+ {
+ const int max_conv_value = std::accumulate(conv, conv + size, 0) * UINT8_MAX;
+ if(max_conv_value <= UINT16_MAX)
+ {
+ return DataType::U16;
+ }
+ else
+ {
+ return DataType::S32;
+ }
+ }
+ else
+ {
+ const int min_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
+ {
+ return b < 0 ? a + b : a;
+ })
+ * UINT8_MAX;
+
+ const int max_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
+ {
+ return b > 0 ? a + b : a;
+ })
+ * UINT8_MAX;
+
+ if((INT16_MIN <= min_value) && (INT16_MAX >= max_value))
+ {
+ return DataType::S16;
+ }
+ else
+ {
+ return DataType::S32;
+ }
+ }
+}
+
+/** Returns expected width and height of output scaled tensor depending on dimensions rounding mode.
+ *
+ * @param[in] width Width of input tensor (Number of columns)
+ * @param[in] height Height of input tensor (Number of rows)
+ * @param[in] kernel_size Kernel size.
+ * @param[in] stride_x Stride of the operation in the x dimension.
+ * @param[in] stride_y Stride of the operation in the y dimension.
+ * @param[in] pad_x Padding size in the x dimension.
+ * @param[in] pad_y Padding size in the y dimension.
+ * @param[in] round_type Dimensions rounding mode.
+ *
+ * @return A pair with the new width in the first position and the new height in the second.
+ */
+const std::pair<unsigned int, unsigned int> scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
+ unsigned int stride_x, unsigned int stride_y,
+ unsigned int pad_x, unsigned int pad_y,
+ DimensionRoundingType round_type);
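+
+// Worked example (illustrative only, assuming the usual downscale formula
+// (width + 2 * pad_x - kernel_size) / stride_x + 1, with the division
+// rounded according to round_type): width 224, kernel 3, stride 2, pad 1
+// gives (224 + 2 - 3) / 2 + 1 = 112 under FLOOR rounding.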
+
+/** Convert a tensor format into a string.
+ *
+ * @param[in] format @ref Format to be translated to string.
+ *
+ * @return The string describing the format.
+ */
+const std::string &string_from_format(Format format);
+
+/** Convert a channel identity into a string.
+ *
+ * @param[in] channel @ref Channel to be translated to string.
+ *
+ * @return The string describing the channel.
+ */
+const std::string &string_from_channel(Channel channel);
+
+/** Convert a data type identity into a string.
+ *
+ * @param[in] dt @ref DataType to be translated to string.
+ *
+ * @return The string describing the data type.
+ */
+const std::string &string_from_data_type(DataType dt);
+/** Convert a matrix pattern into a string.
+ *
+ * @param[in] pattern @ref MatrixPattern to be translated to string.
+ *
+ * @return The string describing the matrix pattern.
+ */
+const std::string &string_from_matrix_pattern(MatrixPattern pattern);
+/** Translates a given activation function to a string.
+ *
+ * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
+ *
+ * @return The string describing the activation function.
+ */
+const std::string &string_from_activation_func(ActivationLayerInfo::ActivationFunction act);
+/** Translates a given non linear function to a string.
+ *
+ * @param[in] function @ref NonLinearFilterFunction to be translated to string.
+ *
+ * @return The string describing the non linear function.
+ */
+const std::string &string_from_non_linear_filter_function(NonLinearFilterFunction function);
+/** Translates a given interpolation policy to a string.
+ *
+ * @param[in] policy @ref InterpolationPolicy to be translated to string.
+ *
+ * @return The string describing the interpolation policy.
+ */
+const std::string &string_from_interpolation_policy(InterpolationPolicy policy);
+/** Translates a given border mode policy to a string.
+ *
+ * @param[in] border_mode @ref BorderMode to be translated to string.
+ *
+ * @return The string describing the border mode.
+ */
+const std::string &string_from_border_mode(BorderMode border_mode);
+/** Translates a given normalization type to a string.
+ *
+ * @param[in] type @ref NormType to be translated to string.
+ *
+ * @return The string describing the normalization type.
+ */
+const std::string &string_from_norm_type(NormType type);
+/** Convert a given string to lower case.
+ *
+ * @param[in] val Given string to lower.
+ *
+ * @return The lowered string
+ */
+std::string lower_string(const std::string &val);
+
+/** Check if a given data type is of floating point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of floating point type, else false.
+ */
+inline bool is_data_type_float(DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::F16:
+ case DataType::F32:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Check if a given data type is of fixed point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of fixed point type, else false.
+ */
+inline bool is_data_type_fixed_point(DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::QS8:
+ case DataType::QS16:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Print consecutive elements to an output stream.
+ *
+ * @param[out] s Output stream to print the elements to.
+ * @param[in] ptr Pointer to print the elements from.
+ * @param[in] n Number of elements to print.
+ * @param[in] stream_width (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
+ * @param[in] element_delim (Optional) Delimiter between consecutive elements. Defaults to a space delimiter
+ */
+template <typename T>
+void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
+{
+ using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
+
+ for(unsigned int i = 0; i < n; ++i)
+ {
+ // Set stream width as it is not a "sticky" stream manipulator
+ if(stream_width != 0)
+ {
+ s.width(stream_width);
+ }
+ s << std::right << static_cast<print_type>(ptr[i]) << element_delim;
+ }
+}
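+
+// Usage sketch (illustrative only): print four floats right-aligned in
+// columns eight characters wide.
+//
+//   const float data[4] = { 1.f, 2.5f, 10.f, 0.125f };
+//   print_consecutive_elements_impl(std::cout, data, 4, 8);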
+
+/** Identify the maximum width of n consecutive elements.
+ *
+ * @param[in] s The output stream which will be used to print the elements. Used to extract the stream format.
+ * @param[in] ptr Pointer to the elements.
+ * @param[in] n Number of elements.
+ *
+ * @return The maximum width of the elements.
+ */
+template <typename T>
+int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, unsigned int n)
+{
+ using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
+
+ int max_width = -1;
+ for(unsigned int i = 0; i < n; ++i)
+ {
+ std::stringstream ss;
+ ss.copyfmt(s);
+ ss << static_cast<print_type>(ptr[i]);
+ max_width = std::max<int>(max_width, ss.str().size());
+ }
+ return max_width;
+}
+
+/** Print consecutive elements to an output stream.
+ *
+ * @param[out] s Output stream to print the elements to.
+ * @param[in] dt Data type of the elements
+ * @param[in] ptr Pointer to print the elements from.
+ * @param[in] n Number of elements to print.
+ * @param[in] stream_width (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
+ * @param[in] element_delim (Optional) Delimiter between consecutive elements. Defaults to a space delimiter
+ */
+void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim = " ");
+
+/** Identify the maximum width of n consecutive elements.
+ *
+ * @param[in] s Output stream to print the elements to.
+ * @param[in] dt Data type of the elements
+ * @param[in] ptr Pointer to print the elements from.
+ * @param[in] n Number of elements to print.
+ *
+ * @return The maximum width of the elements.
+ */
+int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n);
+}
+#endif /*__ARM_COMPUTE_UTILS_H__ */
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
new file mode 100644
index 0000000000..48eba70adf
--- /dev/null
+++ b/arm_compute/core/Validate.h
@@ -0,0 +1,563 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_VALIDATE_H__
+#define __ARM_COMPUTE_VALIDATE_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/IMultiHOG.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace detail
+{
+/** Check whether two dimension objects differ.
+ *
+ * @param[in] dim1 First object to be compared.
+ * @param[in] dim2 Second object to be compared.
+ * @param[in] upper_dim The dimension from which to check.
+ *
+ * @return Return true if the two objects are different.
+ */
+template <typename T>
+inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimensions<T> &dim2, unsigned int upper_dim)
+{
+ for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
+ {
+ if(dim1[i] != dim2[i])
+ {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/** Functor to compare two @ref Dimensions objects and throw an error on mismatch.
+ *
+ * @param[in] dim Object to compare against.
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file File in which the error occurred.
+ * @param[in] line Line in which the error occurred.
+ */
+template <typename T>
+class compare_dimension
+{
+public:
+ compare_dimension(const Dimensions<T> &dim, const char *function, const char *file, int line)
+ : _dim{ dim }, _function{ function }, _file{ file }, _line{ line }
+ {
+ }
+
+ /** Compare the given object against the stored one.
+ *
+ * @param[in] dim To be compared object.
+ */
+ void operator()(const Dimensions<T> &dim)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(have_different_dimensions(_dim, dim, 0), _function, _file, _line,
+ "Objects have different dimensions");
+ }
+
+private:
+ const Dimensions<T> &_dim;
+ const char *const _function;
+ const char *const _file;
+ const int _line;
+};
+} // namespace detail
+/** Throw an error if one of the pointers is a nullptr.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] pointers Pointers to check against nullptr.
+ */
+template <typename... Ts>
+void error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
+{
+ auto is_nullptr = [&](const void *ptr)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(ptr == nullptr, function, file, line);
+ };
+
+ for_each(is_nullptr, std::forward<Ts>(pointers)...);
+}
+#define ARM_COMPUTE_ERROR_ON_NULLPTR(...) ::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the passed window is invalid.
+ *
+ * The window is invalid if:
+ * - It is not a valid window.
+ * - Its dimensions don't match the full window's ones
+ * - The step for each of its dimension is not identical to the corresponding one of the full window.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] full Full size window
+ * @param[in] win Window to validate.
+ */
+void error_on_mismatching_windows(const char *function, const char *file, const int line,
+ const Window &full, const Window &win);
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) ::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w)
+
+/** Throw an error if the passed subwindow is invalid.
+ *
+ * The subwindow is invalid if:
+ * - It is not a valid window.
+ * - It is not fully contained inside the full window
+ * - The step for each of its dimension is not identical to the corresponding one of the full window.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] full Full size window
+ * @param[in] sub Sub-window to validate.
+ */
+void error_on_invalid_subwindow(const char *function, const char *file, const int line,
+ const Window &full, const Window &sub);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) ::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s)
+
+/** Throw an error if the passed coordinates have too many dimensions.
+ *
+ * The coordinates have too many dimensions if any dimension greater than or equal to max_dim is different from 0.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] pos Coordinates to validate
+ * @param[in] max_dim Maximum number of dimensions allowed.
+ */
+void error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
+ const Coordinates &pos, unsigned int max_dim);
+#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) ::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md)
+
+/** Throw an error if the passed window has too many dimensions.
+ *
+ * The window has too many dimensions if any dimension greater than or equal to max_dim is different from 0.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] win Window to validate
+ * @param[in] max_dim Maximum number of dimensions allowed.
+ */
+void error_on_window_dimensions_gte(const char *function, const char *file, const int line,
+ const Window &win, unsigned int max_dim);
+#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) ::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md)
+
+/** Throw an error if the passed dimension objects differ.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] dim1 The first object to be compared.
+ * @param[in] dim2 The second object to be compared.
+ * @param[in] dims (Optional) Further objects to be compared.
+ */
+template <typename T, typename... Ts>
+void error_on_mismatching_dimensions(const char *function, const char *file, int line,
+ const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ for_each(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...);
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the passed two tensors have different shapes, starting from dimension 0
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further tensors to be compared.
+ */
+template <typename... Ts>
+void error_on_mismatching_shapes(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+ error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
+}
+
+/** Throw an error if the passed two tensors have different shapes from the given dimension
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] upper_dim The dimension from which to check.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further tensors to be compared.
+ */
+template <typename... Ts>
+void error_on_mismatching_shapes(const char *function, const char *file, const int line,
+ unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ const std::array < const ITensor *, 2 + sizeof...(Ts) > tensors_array{ { tensor_1, tensor_2, std::forward<Ts>(tensors)... } };
+ ARM_COMPUTE_UNUSED(tensors_array);
+
+ ARM_COMPUTE_ERROR_ON_LOC(*tensors_array.cbegin() == nullptr, function, file, line);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_array.cbegin()), tensors_array.cend(), [&](const ITensor * tensor)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ return detail::have_different_dimensions((*tensors_array.cbegin())->info()->tensor_shape(), tensor->info()->tensor_shape(), upper_dim);
+ }),
+ function, file, line, "Tensors have different shapes");
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) ::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__)
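+
+// Usage sketch (illustrative only, with hypothetical input/output tensors):
+// kernels typically validate their arguments up front with these macros.
+//
+//   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+//   ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);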
+
+/** Throw an error if the passed two tensors have different data types
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further tensors to be compared.
+ */
+template <typename... Ts>
+void error_on_mismatching_data_types(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(tensor_1);
+ ARM_COMPUTE_UNUSED(tensor_2);
+
+ DataType &&first_data_type = tensor_1->info()->data_type();
+ ARM_COMPUTE_UNUSED(first_data_type);
+
+ const std::array<const ITensor *, sizeof...(Ts)> tensors_array{ { std::forward<Ts>(tensors)... } };
+ ARM_COMPUTE_UNUSED(tensors_array);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_2->info()->data_type() != first_data_type || std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+ {
+ return tensor->info()->data_type() != first_data_type;
+ }),
+ function, file, line, "Tensors have different data types");
+}
+
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the passed tensors have different fixed point data types or different fixed point positions
+ *
+ * @note If the first tensor does not have a fixed point data type, the function returns without throwing an error
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further tensors to be compared.
+ */
+template <typename... Ts>
+void error_on_mismatching_fixed_point(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(tensor_1);
+ ARM_COMPUTE_UNUSED(tensor_2);
+
+ DataType &&first_data_type = tensor_1->info()->data_type();
+ const int first_fixed_point_position = tensor_1->info()->fixed_point_position();
+ ARM_COMPUTE_UNUSED(first_data_type);
+ ARM_COMPUTE_UNUSED(first_fixed_point_position);
+
+ if((first_data_type != DataType::QS8) && (first_data_type != DataType::QS16))
+ {
+ return;
+ }
+
+ const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
+ ARM_COMPUTE_UNUSED(tensors_array);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+ {
+ return tensor->info()->data_type() != first_data_type;
+ }),
+ function, file, line, "Tensors have different fixed point data types");
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+ {
+ return tensor->info()->fixed_point_position() != first_fixed_point_position;
+ }),
+ function, file, line, "Tensors have different fixed point positions");
+}
+
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) ::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] object Tensor/multi-image to validate.
+ * @param[in] format First format allowed.
+ * @param[in] formats (Optional) Further allowed formats.
+ */
+template <typename T, typename F, typename... Fs>
+void error_on_format_not_in(const char *function, const char *file, const int line,
+ const T *object, F &&format, Fs &&... formats)
+{
+ ARM_COMPUTE_ERROR_ON_LOC(object == nullptr, function, file, line);
+
+ Format &&object_format = object->info()->format();
+ ARM_COMPUTE_UNUSED(object_format);
+
+ ARM_COMPUTE_ERROR_ON_LOC(object_format == Format::UNKNOWN, function, file, line);
+
+ const std::array<F, sizeof...(Fs)> formats_array{ { std::forward<Fs>(formats)... } };
+ ARM_COMPUTE_UNUSED(formats_array);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(object_format != format && std::none_of(formats_array.begin(), formats_array.end(), [&](const F & f)
+ {
+ return f == object_format;
+ }),
+ function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
+}
+#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
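+
+/* Example (sketch; "input" is a hypothetical ITensor pointer): accept only
+ * U8 or S16 inputs:
+ *
+ *   ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8, Format::S16);
+ */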
+
+/** Throw an error if the data type of the passed tensor does not match any of the data types provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor Tensor to validate.
+ * @param[in] dt First data type allowed.
+ * @param[in] dts (Optional) Further allowed data types.
+ */
+template <typename T, typename... Ts>
+void error_on_data_type_not_in(const char *function, const char *file, const int line,
+ const ITensor *tensor, T &&dt, Ts &&... dts)
+{
+ ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+
+ const DataType &tensor_dt = tensor->info()->data_type(); //NOLINT
+ ARM_COMPUTE_UNUSED(tensor_dt);
+
+ ARM_COMPUTE_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
+
+ const std::array<T, sizeof...(Ts)> dts_array{ { std::forward<Ts>(dts)... } };
+ ARM_COMPUTE_UNUSED(dts_array);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d)
+ {
+ return d == tensor_dt;
+ }),
+ function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str());
+}
+#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) ::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
+
+/** Throw an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor Tensor to validate.
+ * @param[in] num_channels Number of channels to check.
+ * @param[in] dt First data type allowed.
+ * @param[in] dts (Optional) Further allowed data types.
+ */
+template <typename T, typename... Ts>
+void error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
+ const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts)
+{
+ error_on_data_type_not_in(function, file, line, tensor, std::forward<T>(dt), std::forward<Ts>(dts)...);
+
+ const size_t tensor_nc = tensor->info()->num_channels();
+ ARM_COMPUTE_UNUSED(tensor_nc);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_nc != num_channels, function, file, line, "Number of channels %zu. Required number of channels %zu", tensor_nc, num_channels);
+}
+#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)
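+
+/* Example (sketch; "input" is a hypothetical ITensor pointer): require a
+ * single-channel F16 or F32 tensor:
+ *
+ *   ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ */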
+
+/** Throw an error if the tensor is not 2D.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor Tensor to validate.
+ */
+void error_on_tensor_not_2d(const char *function, const char *file, const int line,
+ const ITensor *tensor);
+#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) ::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t)
+
+/** Throw an error if the channel is not one of the allowed channels.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] cn Input channel to validate.
+ * @param[in] channel First channel allowed.
+ * @param[in] channels (Optional) Further allowed channels.
+ */
+template <typename T, typename... Ts>
+void error_on_channel_not_in(const char *function, const char *file, const int line,
+ T cn, T &&channel, Ts &&... channels)
+{
+ ARM_COMPUTE_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line);
+
+ const std::array<T, sizeof...(Ts)> channels_array{ { std::forward<Ts>(channels)... } };
+ ARM_COMPUTE_UNUSED(channels_array);
+ ARM_COMPUTE_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f)
+ {
+ return f == cn;
+ }),
+ function, file, line);
+}
+#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) ::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__)
+
+/** Throw an error if the channel does not exist within the given format.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] fmt Input format.
+ * @param[in] cn Channel to validate.
+ */
+void error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
+ Format fmt, Channel cn);
+#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) ::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c)
+
+/** Throw an error if the @ref IMultiHOG container is invalid
+ *
+ * An @ref IMultiHOG container is invalid if:
+ *
+ * -# it is a nullptr
+ * -# it doesn't contain models
+ * -# its HOG data objects don't all share the same phase_type, normalization_type and l2_hyst_threshold (when normalization_type == L2HYS_NORM)
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] multi_hog IMultiHOG container to validate
+ */
+void error_on_invalid_multi_hog(const char *function, const char *file, const int line,
+ const IMultiHOG *multi_hog);
+#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) ::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m)
+
+/** Throw an error if the kernel is not configured.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] kernel Kernel to validate.
+ */
+void error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+ const IKernel *kernel);
+#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) ::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k)
+
+/** Throw an error if the coordinates and shape of the subtensor are not completely within the parent tensor.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] parent_shape Parent tensor shape
+ * @param[in] coords Coordinates inside the parent tensor where the first element of the subtensor is
+ * @param[in] shape Shape of the subtensor
+ */
+void error_on_invalid_subtensor(const char *function, const char *file, const int line,
+ const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) ::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s)
+
+/** Throw an error if the valid region of a subtensor is not inside the valid region of the parent tensor.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] parent_valid_region Parent valid region.
+ * @param[in] valid_region Valid region of subtensor.
+ */
+void error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+ const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
+#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)
+
+/** Throw an error if the input fixed-point positions are different.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] tensor_1 The first tensor to be compared.
+ * @param[in] tensor_2 The second tensor to be compared.
+ * @param[in] tensors (Optional) Further allowed tensors.
+ */
+template <typename... Ts>
+void error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line,
+ const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+{
+ const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } };
+ ARM_COMPUTE_UNUSED(tensors_array);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor)
+ {
+ return tensor->info()->fixed_point_position() != tensor_1->info()->fixed_point_position();
+ }),
+ function, file, line, "Tensors have different fixed-point positions");
+}
+#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) ::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
+/** Throw an error if the fixed-point value is not representable in the specified Q format.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file Name of the file where the error occurred.
+ * @param[in] line Line on which the error occurred.
+ * @param[in] value The floating point value to be checked.
+ * @param[in] tensor Input tensor that has information on data type and fixed-point position.
+ */
+template <typename... Ts>
+void error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line,
+ float value, const ITensor *tensor)
+{
+ const int fixed_point_position = tensor->info()->fixed_point_position();
+ const DataType dt = tensor->info()->data_type();
+ const unsigned int q_max_range = 0xFFFFFFFFu >> (((sizeof(unsigned int) - element_size_from_data_type(dt)) * 8) + 1);
+ const float max_range = q_max_range / (static_cast<float>(1 << fixed_point_position));
+ ARM_COMPUTE_UNUSED(max_range);
+
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(value > max_range, function, file, line,
+ "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position);
+}
+#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) ::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)
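+
+/* Worked example (sketch, assuming a 32-bit unsigned int): for a QS8 tensor
+ * (element size 1 byte) with fixed_point_position = 3,
+ * q_max_range = 0xFFFFFFFFu >> 25 = 127 and max_range = 127 / (1 << 3) = 15.875,
+ * so any value above 15.875 triggers the error.
+ */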
+}
+#endif /* __ARM_COMPUTE_VALIDATE_H__*/
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
new file mode 100644
index 0000000000..6e7ef22531
--- /dev/null
+++ b/arm_compute/core/Window.h
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WINDOW_H__
+#define __ARM_COMPUTE_WINDOW_H__
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+/** Describe a multidimensional execution window. */
+class Window
+{
+public:
+ /** Alias for dimension 0 also known as X dimension */
+ static constexpr size_t DimX = 0;
+ /** Alias for dimension 1 also known as Y dimension */
+ static constexpr size_t DimY = 1;
+ /** Alias for dimension 2 also known as Z dimension */
+ static constexpr size_t DimZ = 2;
+
+ /** Default constructor: create a window containing a single element. */
+ constexpr Window()
+ : _dims(), _thread_id(0), _num_threads(1)
+ {
+ }
+ /** Copy constructor
+ *
+ * @param[in] src Copy the values from src to a new object
+ */
+ Window(const Window &src);
+
+ /** Describe one of the image's dimensions with a start, end and step.
+ *
+ * Iteration through the elements of the dimension is done like this:
+ * for(int v = start(); v < end(); v += step())
+ * {
+ * ...
+ * }
+ */
+ class Dimension
+ {
+ public:
+ /** Constructor, by default creates a dimension of size 1.
+ *
+ * @param[in] start Start of the dimension
+ * @param[in] end End of the dimension
+ * @param[in] step Step between two elements of the dimension when iterating.
+ *
+ */
+ constexpr Dimension(int start = 0, int end = 1, int step = 1)
+ : _start(start), _end(end), _step(step)
+ {
+ }
+ /** Default assignment operator to allow dimensions to be copied */
+ Dimension &operator=(const Dimension &d) = default;
+ /** Return the start of the dimension */
+ constexpr int start() const
+ {
+ return _start;
+ }
+ /** Return the end of the dimension */
+ constexpr int end() const
+ {
+ return _end;
+ }
+ /** Return the step of the dimension */
+ constexpr int step() const
+ {
+ return _step;
+ }
+ /** Set the dimension's step
+ *
+ * @param[in] step The new step
+ */
+ void set_step(int step)
+ {
+ _step = step;
+ }
+
+ private:
+ int _start; /**< Start of the dimension */
+ int _end; /**< End of the dimension */
+ int _step; /**< Step of the dimension */
+ };
+
+ /** Read only access to a given dimension of the window
+ *
+ * @note Precondition: dimension < Coordinates::num_max_dimensions
+ *
+ * @param[in] dimension The dimension to access
+ *
+ * @return The requested dimension
+ */
+ constexpr const Dimension &operator[](size_t dimension) const;
+
+ /** Alias to access the first dimension of the window
+ *
+ * @return First dimension of the window
+ */
+ constexpr const Dimension &x() const
+ {
+ return _dims.at(Window::DimX);
+ }
+
+ /** Alias to access the second dimension of the window
+ *
+ * @return Second dimension of the window
+ */
+ constexpr const Dimension &y() const
+ {
+ return _dims.at(Window::DimY);
+ }
+
+ /** Alias to access the third dimension of the window
+ *
+ * @return Third dimension of the window
+ */
+ constexpr const Dimension &z() const
+ {
+ return _dims.at(Window::DimZ);
+ }
+
+ /** Set the values of a given dimension
+ *
+ * @param[in] dimension The dimension to set
+ * @param[in] dim The values to set the dimension to
+ */
+ void set(size_t dimension, const Dimension &dim);
+
+ /** Use the tensor's dimensions to fill the window dimensions.
+ *
+ * @param[in] info Tensor information to copy the dimensions from.
+ * @param[in] first_dimension Only copy dimensions whose index is greater than or equal to this value.
+ */
+ void use_tensor_dimensions(const ITensorInfo *info, size_t first_dimension = Window::DimX);
+
+ /** Shift the values of a given dimension by the given shift_value
+ *
+ * @param[in] dimension The dimension to shift
+ * @param[in] shift_value Value by which to shift the start and end of the dimension.
+ */
+ void shift(size_t dimension, int shift_value);
+
+ /** Adjust the start or end of a given dimension by the given value
+ *
+ * @param[in] dimension The dimension to adjust
+ * @param[in] adjust_value The adjusted value.
+ * @param[in] is_at_start Flag indicating whether to adjust the start or the end of the dimension.
+ */
+ void adjust(size_t dimension, int adjust_value, bool is_at_start);
+
+ /** Scale the values of a given dimension by the given scale_value
+ *
+ * @note The end of the window is rounded up to be a multiple of step after the scaling.
+ *
+ * @param[in] dimension The dimension to scale
+ * @param[in] scale_value Value by which to scale the start, end and step of the dimension.
+ */
+ void scale(size_t dimension, float scale_value);
+
+ /** Set the step of a given dimension.
+ *
+ * @param[in] dimension Dimension to update
+ * @param[in] step The new dimension's step value
+ */
+ void set_dimension_step(size_t dimension, int step);
+
+ /** Will validate all the window's dimensions' values when asserts are enabled
+ *
+ * No-op when asserts are disabled
+ */
+ void validate() const;
+
+ /** Return the number of iterations needed to iterate through a given dimension
+ *
+ * @param[in] dimension The requested dimension
+ *
+ * @return The number of iterations
+ */
+ constexpr size_t num_iterations(size_t dimension) const;
+
+ /** Split a window into a set of sub windows along a given dimension
+ *
+ * For example to split a window into 3 sub-windows along the Y axis, you would have to do:<br/>
+ * Window sub0 = window.split_window( 1, 0, 3);<br/>
+ * Window sub1 = window.split_window( 1, 1, 3);<br/>
+ * Window sub2 = window.split_window( 1, 2, 3);<br/>
+ *
+ * @param[in] dimension Dimension along which the split will be performed
+ * @param[in] id Id of the sub-window to return. Must be in the range [0, total-1].
+ * @param[in] total Total number of sub-windows the window will be split into.
+ *
+ * @return The subwindow "id" out of "total"
+ */
+ Window split_window(size_t dimension, size_t id, size_t total) const;
+ /** First 1D slice of the window
+ *
+ * @return The first slice of the window.
+ */
+ Window first_slice_window_1D() const
+ {
+ return first_slice_window<1>();
+ }
+ /** First 2D slice of the window
+ *
+ * @return The first slice of the window.
+ */
+ Window first_slice_window_2D() const
+ {
+ return first_slice_window<2>();
+ }
+ /** First 3D slice of the window
+ *
+ * @return The first slice of the window.
+ */
+ Window first_slice_window_3D() const
+ {
+ return first_slice_window<3>();
+ }
+ /** Slide the passed 1D window slice.
+ *
+ * If slice contains the last slice then it will remain unchanged and false will be returned.
+ *
+ * @param[in,out] slice Current slice, to be updated to the next slice.
+ *
+ * @return true if slice contains a new slice, false if slice already contained the last slice
+ */
+ bool slide_window_slice_1D(Window &slice) const
+ {
+ return slide_window_slice<1>(slice);
+ }
+ /** Slide the passed 2D window slice.
+ *
+ * If slice contains the last slice then it will remain unchanged and false will be returned.
+ *
+ * @param[in,out] slice Current slice, to be updated to the next slice.
+ *
+ * @return true if slice contains a new slice, false if slice already contained the last slice
+ */
+ bool slide_window_slice_2D(Window &slice) const
+ {
+ return slide_window_slice<2>(slice);
+ }
+ /** Slide the passed 3D window slice.
+ *
+ * If slice contains the last slice then it will remain unchanged and false will be returned.
+ *
+ * @param[in,out] slice Current slice, to be updated to the next slice.
+ *
+ * @return true if slice contains a new slice, false if slice already contained the last slice
+ */
+ bool slide_window_slice_3D(Window &slice) const
+ {
+ return slide_window_slice<3>(slice);
+ }
+ /** Slide the passed 4D window slice.
+ *
+ * If slice contains the last slice then it will remain unchanged and false will be returned.
+ *
+ * @param[in,out] slice Current slice, to be updated to the next slice.
+ *
+ * @return true if slice contains a new slice, false if slice already contained the last slice
+ */
+ bool slide_window_slice_4D(Window &slice) const
+ {
+ return slide_window_slice<4>(slice);
+ }
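+ /* A typical slicing loop over a window (sketch, not prescribed by this
+ * class): process one 2D slice at a time until the last slice has been
+ * handled.
+ *
+ *   Window slice = window.first_slice_window_2D();
+ *   do
+ *   {
+ *       // ... run the kernel on the current slice ...
+ *   }
+ *   while(window.slide_window_slice_2D(slice));
+ */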
+ /** Sets the ID of the thread that the window is associated with.
+ *
+ * @param id ID of the thread that the window is associated with.
+ */
+ void set_thread_id(unsigned int id)
+ {
+ _thread_id = id;
+ }
+ /** Sets the number of dispatched threads that the window is associated with.
+ *
+ * @param num_threads The number of dispatched threads.
+ */
+ void set_num_threads(unsigned int num_threads)
+ {
+ _num_threads = num_threads;
+ }
+ /** Get the ID of the thread that the window is associated with.
+ *
+ * @return ID of the thread that the window is associated with.
+ */
+ constexpr unsigned int thread_id() const
+ {
+ return _thread_id;
+ }
+ /** Get the number of dispatched threads that the window is associated with.
+ *
+ * @return The number of dispatched threads.
+ */
+ constexpr unsigned int num_threads() const
+ {
+ return _num_threads;
+ }
+
+private:
+ /** First slice of the window
+ *
+ * @return The first slice of the window.
+ */
+ template <unsigned int window_dimension>
+ Window first_slice_window() const;
+
+ /** Slide the passed window slice.
+ *
+ * If slice contains the last slice then it will remain unchanged and false will be returned.
+ *
+ * @param[in,out] slice Current slice, to be updated to the next slice.
+ *
+ * @return true if slice contains a new slice, false if slice already contained the last slice
+ */
+ template <unsigned int window_dimension>
+ bool slide_window_slice(Window &slice) const;
+
+private:
+ std::array<Dimension, Coordinates::num_max_dimensions> _dims;
+ unsigned int _thread_id;
+ unsigned int _num_threads;
+};
+}
+#include "Window.inl"
+#endif /*__ARM_COMPUTE_WINDOW_H__ */
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
new file mode 100644
index 0000000000..75428a145b
--- /dev/null
+++ b/arm_compute/core/Window.inl
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+namespace arm_compute
+{
+inline Window::Window(const Window &src)
+ : _dims(), _thread_id(src._thread_id), _num_threads(src._num_threads)
+{
+ for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ {
+ set(i, src[i]);
+ }
+}
+
+inline constexpr const Window::Dimension &Window::operator[](const size_t dimension) const
+{
+ // Precondition: dimension < Coordinates::num_max_dimensions
+ return _dims.at(dimension);
+}
+inline void Window::set(const size_t dimension, const Window::Dimension &dim)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+ _dims[dimension] = dim;
+}
+
+inline void Window::shift(const size_t dimension, const int shift_value)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+ Window::Dimension &d = _dims[dimension];
+ d = Window::Dimension(d.start() + shift_value, d.end() + shift_value, d.step());
+}
+
+inline void Window::adjust(size_t dimension, int adjust_value, bool is_at_start)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+ Window::Dimension &d = _dims[dimension];
+
+ if(is_at_start)
+ {
+ d = Window::Dimension(d.start() + adjust_value, d.end(), d.step());
+ }
+ else
+ {
+ d = Window::Dimension(d.start(), d.end() + adjust_value, d.step());
+ }
+}
+
+inline void Window::scale(const size_t dimension, float scale_value)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+ Window::Dimension &d = _dims[dimension];
+ const int scaled_step = d.step() * scale_value;
+ const int scaled_end = ceil_to_multiple(d.end() * scale_value, scaled_step);
+ d = Window::Dimension(d.start() * scale_value, scaled_end, scaled_step);
+}
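+
+/* For example (sketch): scaling Dimension(start = 0, end = 10, step = 2) by 0.5
+ * gives scaled_step = 1 and scaled_end = ceil_to_multiple(5, 1) = 5, i.e. the
+ * dimension becomes Dimension(0, 5, 1).
+ */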
+
+inline void Window::set_dimension_step(const size_t dimension, const int step)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+ _dims[dimension].set_step(step);
+}
+
+inline void Window::validate() const
+{
+ for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(_dims[i].step() == 0);
+ ARM_COMPUTE_ERROR_ON(_dims[i].end() <= _dims[i].start());
+ ARM_COMPUTE_ERROR_ON((_dims[i].end() - _dims[i].start()) % _dims[i].step());
+ }
+}
+
+inline constexpr size_t Window::num_iterations(size_t dimension) const
+{
+ // Precondition: dimension < Coordinates::num_max_dimensions
+ // Precondition: (end - start) % step == 0
+ return (_dims.at(dimension).end() - _dims.at(dimension).start()) / _dims.at(dimension).step();
+}
+
+inline Window Window::split_window(const size_t dimension, const size_t id, const size_t total) const
+{
+ ARM_COMPUTE_ERROR_ON(id >= total);
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+
+ Window out;
+
+ for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+ {
+ if(d == dimension)
+ {
+ int start = _dims[d].start();
+ int end = _dims[d].end();
+ int per_sub_window = (num_iterations(d) / total) * _dims[d].step();
+
+ start += id * per_sub_window;
+
+ if(id != total - 1)
+ {
+ end = start + per_sub_window;
+ }
+
+ out.set(d, Dimension(start, end, _dims[d].step()));
+ }
+ else
+ {
+ out.set(d, _dims[d]);
+ }
+ }
+
+ return out;
+}
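+
+/* Worked example (sketch): splitting a dimension (start = 0, end = 10, step = 1)
+ * into total = 3 sub-windows gives per_sub_window = 3, so the sub-windows cover
+ * [0, 3), [3, 6) and [6, 10): any remainder is folded into the last sub-window.
+ */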
+
+template <unsigned int window_dimension>
+inline bool Window::slide_window_slice(Window &slice) const
+{
+ for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+ {
+ // Did we reach the end of this dimension?
+ const int v = slice._dims[n].start() + 1;
+
+ if(v < _dims[n].end())
+ {
+ // No: increment
+ slice._dims[n] = Dimension(v, v + 1, 1);
+
+ // Reset lower dimensions:
+ for(unsigned int lower = window_dimension; lower < n; ++lower)
+ {
+ slice._dims[lower] = Dimension(_dims[lower].start(), _dims[lower].start() + 1, 1);
+ }
+ return true;
+ }
+ }
+
+ // It was the last slice
+ return false; // Iteration over
+}
+
+template <unsigned int window_dimension>
+inline Window Window::first_slice_window() const
+{
+ Window slice;
+
+ std::copy_n(_dims.begin(), window_dimension, slice._dims.begin());
+
+ //Initialise higher dimensions to be the first slice.
+ for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+ {
+ slice._dims[n] = Dimension(_dims[n].start(), _dims[n].start() + 1, 1);
+ }
+
+ return slice;
+}
+
+inline void Window::use_tensor_dimensions(const ITensorInfo *info, const size_t first_dimension)
+{
+ for(unsigned int n = first_dimension; n < info->num_dimensions(); ++n)
+ {
+ set(n, Window::Dimension(0, std::max(info->dimension(n), static_cast<size_t>(1))));
+ }
+}
+}
diff --git a/arm_compute/runtime/Array.h b/arm_compute/runtime/Array.h
new file mode 100644
index 0000000000..c8a240e428
--- /dev/null
+++ b/arm_compute/runtime/Array.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ARRAY_H__
+#define __ARM_COMPUTE_ARRAY_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the IArray interface which allocates a static number of T values */
+template <class T>
+class Array : public IArray<T>
+{
+public:
+ /** Default constructor: empty array */
+ Array()
+ : IArray<T>(0), _values(nullptr)
+ {
+ }
+ /** Constructor: initializes an array which can contain up to max_num_values values
+ *
+ * @param[in] max_num_values Maximum number of values the array will be able to store
+ */
+ Array(size_t max_num_values)
+ : IArray<T>(max_num_values), _values(arm_compute::cpp14::make_unique<T[]>(max_num_values))
+ {
+ }
+
+ // Inherited methods overridden:
+ T *buffer() const override
+ {
+ return _values.get();
+ }
+
+private:
+ std::unique_ptr<T[]> _values;
+};
+
+using KeyPointArray = Array<KeyPoint>;
+using Coordinates2DArray = Array<Coordinates2D>;
+using DetectionWindowArray = Array<DetectionWindow>;
+using Size2DArray = Array<Size2D>;
+using UInt8Array = Array<uint8_t>;
+using UInt16Array = Array<uint16_t>;
+using UInt32Array = Array<uint32_t>;
+using Int16Array = Array<int16_t>;
+using Int32Array = Array<int32_t>;
+using FloatArray = Array<float>;
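+
+/* Minimal usage sketch (push_back() and buffer() come from the IArray
+ * interface; the capacity is illustrative):
+ *
+ *   KeyPointArray points(100); // static capacity for 100 keypoints
+ *   KeyPoint kp{};
+ *   points.push_back(kp);      // returns false once the array is full
+ *   KeyPoint *raw = points.buffer();
+ */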
+}
+#endif /* __ARM_COMPUTE_ARRAY_H__ */
diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h
new file mode 100644
index 0000000000..f4c2ef06d9
--- /dev/null
+++ b/arm_compute/runtime/CL/CLArray.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARRAY_H__
+#define __ARM_COMPUTE_CLARRAY_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+/** CLArray implementation */
+template <class T>
+class CLArray : public ICLArray<T>
+{
+public:
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArray(const CLArray &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ const CLArray &operator=(const CLArray &) = delete;
+ /** Constructor: initializes an array which can contain up to max_num_values values
+ *
+ * @param[in] max_num_values Maximum number of values the array will be able to store
+ */
+ CLArray(size_t max_num_values)
+ : ICLArray<T>(max_num_values), _buffer(cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T)))
+ {
+ }
+ /** Enqueue a map operation of the allocated buffer.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed.
+ */
+ void map(bool blocking = true)
+ {
+ ICLArray<T>::map(CLScheduler::get().queue(), blocking);
+ }
+ using ICLArray<T>::map;
+ /** Enqueue an unmap operation of the allocated and mapped buffer.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ */
+ void unmap()
+ {
+ ICLArray<T>::unmap(CLScheduler::get().queue());
+ }
+ using ICLArray<T>::unmap;
+
+ // Inherited methods overridden:
+ const cl::Buffer &cl_buffer() const override
+ {
+ return _buffer;
+ }
+
+protected:
+ // Inherited methods overridden:
+ uint8_t *do_map(cl::CommandQueue &q, bool blocking) override
+ {
+ ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get());
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
+ }
+ void do_unmap(cl::CommandQueue &q, uint8_t *mapping) override
+ {
+ ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get());
+ q.enqueueUnmapMemObject(_buffer, mapping);
+ }
+
+private:
+ cl::Buffer _buffer;
+};
+
+using CLKeyPointArray = CLArray<KeyPoint>;
+using CLCoordinates2DArray = CLArray<Coordinates2D>;
+using CLDetectionWindowArray = CLArray<DetectionWindow>;
+using CLSize2DArray = CLArray<Size2D>;
+using CLUInt8Array = CLArray<cl_uchar>;
+using CLUInt16Array = CLArray<cl_ushort>;
+using CLUInt32Array = CLArray<cl_uint>;
+using CLInt16Array = CLArray<cl_short>;
+using CLInt32Array = CLArray<cl_int>;
+using CLFloatArray = CLArray<cl_float>;
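+
+/* Minimal usage sketch (assumes the CL runtime has already been initialised,
+ * e.g. via CLScheduler::get().default_init(); the sizes are illustrative):
+ *
+ *   CLKeyPointArray corners(10000); // device buffer for up to 10000 keypoints
+ *   corners.map();                  // blocking map: host access valid on return
+ *   // ... read/write the values through the mapped buffer ...
+ *   corners.unmap();                // enqueues the unmap; flush the queue before
+ *                                   // the device touches the memory again
+ */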
+}
+#endif /* __ARM_COMPUTE_CLARRAY_H__ */
diff --git a/arm_compute/runtime/CL/CLDistribution1D.h b/arm_compute/runtime/CL/CLDistribution1D.h
new file mode 100644
index 0000000000..55dd1247ed
--- /dev/null
+++ b/arm_compute/runtime/CL/CLDistribution1D.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDISTRIBUTION1D_H__
+#define __ARM_COMPUTE_CLDISTRIBUTION1D_H__
+
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** CLDistribution1D object class */
+class CLDistribution1D : public ICLDistribution1D
+{
+public:
+ /** Constructor: Creates a 1D CLDistribution of a consecutive interval [offset, offset + range - 1]
+ * defined by a start offset and valid range, divided equally into num_bins parts.
+ *
+ * @param[in] num_bins The number of bins the distribution is divided in.
+ * @param[in] offset The start of the values to use.
+ * @param[in] range The total number of the consecutive values of the distribution interval.
+ */
+ CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
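+ /* For example (sketch): CLDistribution1D(256, 0, 256) describes a 256-bin
+ * distribution covering the full [0, 255] value range of an 8-bit image.
+ */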
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLDistribution1D(const CLDistribution1D &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLDistribution1D &operator=(const CLDistribution1D &) = delete;
+ /** Enqueue a map operation of the allocated buffer.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed.
+ */
+ void map(bool blocking = true);
+ using ICLDistribution1D::map;
+ /** Enqueue an unmap operation of the allocated and mapped buffer.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ */
+ void unmap();
+ using ICLDistribution1D::unmap;
+
+ // Inherited methods overridden:
+ cl::Buffer &cl_buffer() override;
+
+protected:
+ // Inherited methods overridden:
+ uint32_t *do_map(cl::CommandQueue &q, bool blocking) override;
+ void do_unmap(cl::CommandQueue &q) override;
+
+private:
+ cl::Buffer _mem;
+};
+}
+#endif /* __ARM_COMPUTE_CLDISTRIBUTION1D_H__ */
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
new file mode 100644
index 0000000000..82929ba139
--- /dev/null
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFUNCTIONS_H__
+#define __ARM_COMPUTE_CLFUNCTIONS_H__
+
+/* Header regrouping all the CL functions */
+#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
+#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
+#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
+#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
+#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
+#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
+#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
+#include "arm_compute/runtime/CL/functions/CLConvolution.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLDerivative.h"
+#include "arm_compute/runtime/CL/functions/CLDilate.h"
+#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
+#include "arm_compute/runtime/CL/functions/CLErode.h"
+#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
+#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
+#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
+#include "arm_compute/runtime/CL/functions/CLHistogram.h"
+#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
+#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
+#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
+#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
+#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
+#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
+#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
+#include "arm_compute/runtime/CL/functions/CLPhase.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLRemap.h"
+#include "arm_compute/runtime/CL/functions/CLScale.h"
+#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
+#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
+#include "arm_compute/runtime/CL/functions/CLThreshold.h"
+#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
+#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
+
+#endif /* __ARM_COMPUTE_CLFUNCTIONS_H__ */
diff --git a/arm_compute/runtime/CL/CLHOG.h b/arm_compute/runtime/CL/CLHOG.h
new file mode 100644
index 0000000000..9b4a303eca
--- /dev/null
+++ b/arm_compute/runtime/CL/CLHOG.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOG_H__
+#define __ARM_COMPUTE_CLHOG_H__
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** OpenCL implementation of HOG data-object */
+class CLHOG : public ICLHOG
+{
+public:
+ /** Default constructor */
+ CLHOG();
+ /** Allocate the HOG descriptor using the given HOG's metadata
+ *
+ * @param[in] input HOG's metadata used to allocate the HOG descriptor
+ */
+ void init(const HOGInfo &input);
+
+ /** Enqueue a map operation of the allocated buffer.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed.
+ */
+ void map(bool blocking = true);
+ using ICLHOG::map;
+
+ /** Enqueue an unmap operation of the allocated and mapped buffer.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ */
+ void unmap();
+ using ICLHOG::unmap;
+
+ // Inherited method overridden:
+ void free() override;
+ const HOGInfo *info() const override;
+ const cl::Buffer &cl_buffer() const override;
+
+protected:
+ // Inherited methods overridden:
+ uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+ void do_unmap(cl::CommandQueue &q) override;
+
+private:
+ HOGInfo _info;
+ cl::Buffer _buffer;
+};
+}
+#endif /* __ARM_COMPUTE_CLHOG_H__ */
diff --git a/arm_compute/runtime/CL/CLLut.h b/arm_compute/runtime/CL/CLLut.h
new file mode 100644
index 0000000000..9bac2b44c3
--- /dev/null
+++ b/arm_compute/runtime/CL/CLLut.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLUT_H__
+#define __ARM_COMPUTE_CLLUT_H__
+
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLLutAllocator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ILutAllocator;
+
+/** Basic implementation of the OpenCL lut interface */
+class CLLut : public ICLLut
+{
+public:
+ /** Constructor */
+ CLLut();
+ /** Constructor: initializes a LUT which can contain num_elements elements of type data_type.
+ *
+ * @param[in] num_elements Number of elements of the LUT.
+ * @param[in] data_type Data type of each element.
+ */
+ CLLut(size_t num_elements, DataType data_type);
+ /** Return a pointer to the lut's allocator
+ *
+ * @return A pointer to the lut's allocator
+ */
+ ILutAllocator *allocator();
+ /** Enqueue a map operation of the allocated buffer.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed.
+ */
+ void map(bool blocking = true);
+ using ICLLut::map;
+ /** Enqueue an unmap operation of the allocated and mapped buffer.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ */
+ void unmap();
+ using ICLLut::unmap;
+
+ // Inherited methods overridden:
+ size_t num_elements() const override;
+ uint32_t index_offset() const override;
+ size_t size_in_bytes() const override;
+ DataType type() const override;
+ const cl::Buffer &cl_buffer() const override;
+ void clear() override;
+
+protected:
+ // Inherited methods overridden:
+ uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+ void do_unmap(cl::CommandQueue &q) override;
+
+private:
+ CLLutAllocator _allocator; /**< Instance of the OpenCL lut allocator */
+};
+}
+#endif /*__ARM_COMPUTE_CLLUT_H__ */
diff --git a/arm_compute/runtime/CL/CLLutAllocator.h b/arm_compute/runtime/CL/CLLutAllocator.h
new file mode 100644
index 0000000000..4648ffb51f
--- /dev/null
+++ b/arm_compute/runtime/CL/CLLutAllocator.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLUTALLOCATOR_H__
+#define __ARM_COMPUTE_CLLUTALLOCATOR_H__
+
+#include "arm_compute/runtime/ILutAllocator.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic implementation of a CL memory LUT allocator. */
+class CLLutAllocator : public ILutAllocator
+{
+public:
+ /** Default constructor. */
+ CLLutAllocator();
+ /** Default destructor. */
+ ~CLLutAllocator() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLLutAllocator(const CLLutAllocator &) = delete;
+ /** Prevent instances of this class from being copy assigned (As this class contains pointers). */
+ const CLLutAllocator &operator=(const CLLutAllocator &) = delete;
+ /** Interface to be implemented by the child class to return the pointer to the mapped data. */
+ uint8_t *data();
+ /** Interface to be implemented by the child class to return the pointer to the CL data. */
+ const cl::Buffer &cl_data() const;
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ *
+ * @return The mapping address.
+ */
+ uint8_t *map(cl::CommandQueue &q, bool blocking);
+ /** Enqueue an unmap operation of the allocated buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] mapping The cpu mapping to unmap.
+ */
+ void unmap(cl::CommandQueue &q, uint8_t *mapping);
+
+protected:
+ /** Allocate num_elements() * sizeof(type()) of OpenCL memory. */
+ void allocate() override;
+ /** Call map() on the OpenCL buffer.
+ *
+ * @return A pointer to the beginning of the LUT's allocation.
+ */
+ uint8_t *lock() override;
+ /** Call unmap() on the OpenCL buffer. */
+ void unlock() override;
+
+private:
+ cl::Buffer _buffer; /**< OpenCL buffer containing the LUT data. */
+ uint8_t *_mapping; /**< Pointer to the CPU mapping of the OpenCL buffer. */
+};
+}
+
+#endif /* __ARM_COMPUTE_CLLUTALLOCATOR_H__ */
diff --git a/arm_compute/runtime/CL/CLMultiHOG.h b/arm_compute/runtime/CL/CLMultiHOG.h
new file mode 100644
index 0000000000..17bb4e03c1
--- /dev/null
+++ b/arm_compute/runtime/CL/CLMultiHOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMULTIHOG_H__
+#define __ARM_COMPUTE_CLMULTIHOG_H__
+
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the CL multi HOG data-objects */
+class CLMultiHOG : public ICLMultiHOG
+{
+public:
+ /** Constructor
+ *
+ * @param[in] num_models Number of HOG data objects to contain
+ *
+ */
+ CLMultiHOG(size_t num_models);
+
+ // Inherited methods overridden:
+ size_t num_models() const override;
+ ICLHOG *cl_model(size_t index) override;
+ const ICLHOG *cl_model(size_t index) const override;
+
+private:
+ size_t _num_models;
+ std::unique_ptr<CLHOG[]> _model;
+};
+}
+#endif /*__ARM_COMPUTE_CLMULTIHOG_H__ */
diff --git a/arm_compute/runtime/CL/CLMultiImage.h b/arm_compute/runtime/CL/CLMultiImage.h
new file mode 100644
index 0000000000..f70929db07
--- /dev/null
+++ b/arm_compute/runtime/CL/CLMultiImage.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMULTIIMAGE_H__
+#define __ARM_COMPUTE_CLMULTIIMAGE_H__
+
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic implementation of the CL multi-planar image interface */
+class CLMultiImage : public ICLMultiImage
+{
+public:
+ /** Constructor */
+ CLMultiImage();
+ /** Init the multi-planar image
+ *
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ */
+ void init(unsigned int width, unsigned int height, Format format);
+ /** Init the multi-planar image
+ *
+ * @note Uses conservative padding strategy which fits all kernels.
+ *
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ */
+ void init_auto_padding(unsigned int width, unsigned int height, Format format);
+ /** Allocate a previously initialised multi-image
+ *
+ * @note The multi image must not already be allocated when calling this function.
+ *
+ **/
+ void allocate();
+
+ // Inherited methods overridden:
+ const MultiImageInfo *info() const override;
+ CLImage *cl_plane(unsigned int index) override;
+ const CLImage *cl_plane(unsigned int index) const override;
+
+private:
+ /** Init the multi-planar image
+ *
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ * @param[in] auto_padding Specifies whether the image uses auto padding
+ */
+ void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding);
+
+ MultiImageInfo _info; /**< Instance of the multi-planar image's metadata */
+ std::array<CLImage, 3> _plane; /**< CLImage instances holding the image planes */
+};
+}
+#endif /*__ARM_COMPUTE_CLMULTIIMAGE_H__ */
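
A minimal sketch of the init/allocate sequence, assuming an NV12 image (dimensions are illustrative):

    CLMultiImage img;
    img.init(640U, 480U, Format::NV12); // two planes: Y plus interleaved UV
    img.allocate();                     // must not already be allocated
    CLImage *luma = img.cl_plane(0);    // access the Y plane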
diff --git a/arm_compute/runtime/CL/CLPyramid.h b/arm_compute/runtime/CL/CLPyramid.h
new file mode 100644
index 0000000000..5e0afb3c63
--- /dev/null
+++ b/arm_compute/runtime/CL/CLPyramid.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPYRAMID_H__
+#define __ARM_COMPUTE_CLPYRAMID_H__
+
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <cstddef>
+#include <memory>
+
+namespace arm_compute
+{
+class CLTensor;
+
+/** Basic implementation of the OpenCL pyramid interface */
+class CLPyramid : public IPyramid
+{
+public:
+ /** Default constructor */
+ CLPyramid();
+ /** Initialize pyramid data-object using the given Pyramid's metadata
+ *
+ * @param[in] info Pyramid's metadata
+ */
+ void init(const PyramidInfo &info);
+
+ /** Initialize pyramid data-object using the given Pyramid's metadata
+ *
+ * @note Uses conservative padding strategy which fits all kernels.
+ *
+ * @param[in] info Pyramid's metadata
+ */
+ void init_auto_padding(const PyramidInfo &info);
+
+ /** Allocate the planes in the pyramid
+ *
+ * @note The pyramid must not already be allocated when calling this function.
+ *
+ **/
+ void allocate();
+
+ // Inherited methods overridden:
+ const PyramidInfo *info() const override;
+ CLTensor *get_pyramid_level(size_t index) const override;
+
+private:
+ /** Initialize pyramid data-object using the given Pyramid's metadata
+ *
+ * @param[in] info Pyramid's metadata
+ * @param[in] auto_padding Specifies whether the images in the pyramid use auto padding
+ */
+ void internal_init(const PyramidInfo &info, bool auto_padding);
+
+ PyramidInfo _info;
+ std::unique_ptr<CLTensor[]> _pyramid;
+};
+}
+#endif /*__ARM_COMPUTE_CLPYRAMID_H__ */
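
A minimal sketch of building a half-scale pyramid, assuming PyramidInfo's (levels, scale, shape, format) constructor (values are illustrative):

    PyramidInfo info(3 /* levels */, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::U8);
    CLPyramid pyramid;
    pyramid.init(info);
    pyramid.allocate();                              // allocates every level
    CLTensor *level0 = pyramid.get_pyramid_level(0); // full-resolution level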
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
new file mode 100644
index 0000000000..8e80259b59
--- /dev/null
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCHEDULER_H__
+#define __ARM_COMPUTE_CLSCHEDULER_H__
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLKernel;
+
+/** Provides global access to a CL context and command queue. */
+class CLScheduler
+{
+private:
+ /** Constructor */
+ CLScheduler();
+
+public:
+ /** Access the scheduler singleton.
+ *
+ * @return The scheduler
+ */
+ static CLScheduler &get();
+ /** Initialises the context and command queue used by the scheduler to default values
+ * and sets a default device and kernel path for the @ref CLKernelLibrary.
+ */
+ void default_init()
+ {
+ CLKernelLibrary::get().init("./cl_kernels/", cl::Context::getDefault(), cl::Device::getDefault());
+ init(cl::Context::getDefault(), cl::CommandQueue::getDefault(), cl::Device::getDefault());
+ }
+ /** Schedule the execution of the passed kernel if possible.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel.
+ */
+ void enqueue(ICLKernel &kernel, bool flush = true);
+
+ /** Initialises the context and command queue to be used by the scheduler.
+ *
+ * @param[in] context A CL context.
+ * @param[in] queue A CL command queue.
+ * @param[in] device A CL device.
+ */
+ void init(cl::Context context = cl::Context::getDefault(), cl::CommandQueue queue = cl::CommandQueue::getDefault(),
+ cl::Device device = cl::Device::getDefault())
+ {
+ _context = std::move(context);
+ _queue = std::move(queue);
+ _target = get_target_from_device(device);
+ }
+
+ /** Accessor for the associated CL context.
+ *
+ * @return A CL context.
+ */
+ cl::Context &context()
+ {
+ return _context;
+ }
+
+ /** Accessor to set the CL context to be used by the scheduler.
+ *
+ * @param[in] context A CL context.
+ */
+ void set_context(cl::Context context)
+ {
+ _context = std::move(context);
+ }
+
+ /** Accessor for the associated CL command queue.
+ *
+ * @return A CL command queue.
+ */
+ cl::CommandQueue &queue()
+ {
+ return _queue;
+ }
+
+ /** Get the target GPU.
+ *
+ * @return The target GPU.
+ */
+ GPUTarget target() const
+ {
+ return _target;
+ }
+
+ /** Accessor to set the CL command queue to be used by the scheduler.
+ *
+ * @param[in] queue A CL command queue.
+ */
+ void set_queue(cl::CommandQueue queue)
+ {
+ _queue = std::move(queue);
+ }
+
+ /** Accessor to set target GPU to be used by the scheduler.
+ *
+ * @param[in] target The target GPU.
+ */
+ void set_target(GPUTarget target)
+ {
+ _target = target;
+ }
+
+ /** Blocks until all commands in the associated command queue have finished. */
+ void sync()
+ {
+ _queue.finish();
+ }
+
+ /** Enqueues a marker into the associated command queue and returns the event.
+ *
+ * @return An event that can be waited on to block the executing thread.
+ */
+ cl::Event enqueue_sync_event()
+ {
+ cl::Event event;
+ _queue.enqueueMarker(&event);
+
+ return event;
+ }
+
+private:
+ cl::Context _context;
+ cl::CommandQueue _queue;
+ GPUTarget _target;
+};
+}
+#endif /* __ARM_COMPUTE_CLSCHEDULER_H__ */
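
A minimal sketch of the intended start-up and synchronisation flow, using @ref CLBox3x3 (declared later in this patch) as the scheduled function (shapes are illustrative):

    CLScheduler::get().default_init(); // default context, queue, device, kernel path
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U, 128U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 128U), Format::U8));
    src.allocator()->allocate();
    dst.allocator()->allocate();
    CLBox3x3 box;
    box.configure(&src, &dst, BorderMode::REPLICATE);
    box.run();                 // enqueues the kernels on the scheduler's queue
    CLScheduler::get().sync(); // block until the queue has drained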
diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h
new file mode 100644
index 0000000000..4bab164779
--- /dev/null
+++ b/arm_compute/runtime/CL/CLSubTensor.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSUBTENSOR_H__
+#define __ARM_COMPUTE_CLSUBTENSOR_H__
+
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the OpenCL sub-tensor interface */
+class CLSubTensor : public ICLTensor
+{
+public:
+ /** Constructor
+ *
+ * @param[in] parent Parent tensor
+ * @param[in] tensor_shape Shape of the subtensor
+ * @param[in] coords Coordinates of the first subtensor element inside the parent tensor.
+ */
+ CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+ /** Default destructor */
+ ~CLSubTensor() = default;
+ /** Prevent instances of this class from being copy constructed */
+ CLSubTensor(const CLSubTensor &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLSubTensor &operator=(const CLSubTensor &) = delete;
+ /** Allow instances of this class to be move constructed */
+ CLSubTensor(CLSubTensor &&) = default;
+ /** Allow instances of this class to be moved */
+ CLSubTensor &operator=(CLSubTensor &&) = default;
+
+ /** Enqueue a map operation of the allocated buffer.
+ *
+ * @note Mapping a subtensor will lead to the mapping of the whole parent tensor for now.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed.
+ */
+ void map(bool blocking = true);
+ using ICLTensor::map;
+ /** Enqueue an unmap operation of the allocated and mapped buffer.
+ *
+ * @note Unmapping a subtensor will lead to the unmapping of the whole parent tensor for now.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ */
+ void unmap();
+ using ICLTensor::unmap;
+
+ /** Return the parent tensor of the subtensor
+ *
+ * @return Parent tensor
+ */
+ ICLTensor *parent();
+
+ // Inherited methods overridden:
+ ITensorInfo *info() const override;
+ ITensorInfo *info() override;
+ const cl::Buffer &cl_buffer() const override;
+
+protected:
+ // Inherited methods overridden:
+ uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+ void do_unmap(cl::CommandQueue &q) override;
+
+private:
+ ICLTensor *_parent;
+ mutable SubTensorInfo _info;
+};
+}
+#endif /*__ARM_COMPUTE_CLSUBTENSOR_H__ */
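
A minimal sketch of carving an 8x8 view out of a 16x16 parent tensor (shape and coordinates are illustrative):

    CLTensor parent;
    parent.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    parent.allocator()->allocate();
    // View starting at element (4, 4) of the parent; no new memory is allocated.
    CLSubTensor sub(&parent, TensorShape(8U, 8U), Coordinates(4, 4));
    sub.map();   // note: currently maps the whole parent tensor
    // ... CPU access through the ITensor interface ...
    sub.unmap();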
diff --git a/arm_compute/runtime/CL/CLTensor.h b/arm_compute/runtime/CL/CLTensor.h
new file mode 100644
index 0000000000..2c685d1ed1
--- /dev/null
+++ b/arm_compute/runtime/CL/CLTensor.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTENSOR_H__
+#define __ARM_COMPUTE_CLTENSOR_H__
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorAllocator;
+class ITensorInfo;
+
+/** Basic implementation of the OpenCL tensor interface */
+class CLTensor : public ICLTensor
+{
+public:
+ /** Constructor */
+ CLTensor();
+ /** Return a pointer to the tensor's allocator
+ *
+ * @return A pointer to the tensor's allocator
+ */
+ ITensorAllocator *allocator();
+ /** Enqueue a map operation of the allocated buffer.
+ *
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed.
+ */
+ void map(bool blocking = true);
+ using ICLTensor::map;
+ /** Enqueue an unmap operation of the allocated and mapped buffer.
+ *
+ * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ */
+ void unmap();
+ using ICLTensor::unmap;
+
+ // Inherited methods overridden:
+ TensorInfo *info() const override;
+ TensorInfo *info() override;
+ const cl::Buffer &cl_buffer() const override;
+
+protected:
+ // Inherited methods overridden:
+ uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
+ void do_unmap(cl::CommandQueue &q) override;
+
+private:
+ mutable CLTensorAllocator _allocator; /**< Instance of the OpenCL tensor allocator */
+};
+
+using CLImage = CLTensor;
+}
+#endif /*__ARM_COMPUTE_CLTENSOR_H__ */
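
A minimal sketch of the allocate/map/unmap life cycle (shape and data type are illustrative):

    CLTensor t;
    t.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
    t.allocator()->allocate(); // creates the underlying cl::Buffer
    t.map();                   // blocking map: the pointer is valid on return
    // ... fill the tensor on the CPU through the ITensor accessors ...
    t.unmap();                 // the CPU mapping must not be used afterwards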
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
new file mode 100644
index 0000000000..ed371e0642
--- /dev/null
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTENSORALLOCATOR_H__
+#define __ARM_COMPUTE_CLTENSORALLOCATOR_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic implementation of a CL memory tensor allocator. */
+class CLTensorAllocator : public ITensorAllocator
+{
+public:
+ /** Default constructor. */
+ CLTensorAllocator();
+ /** Prevent instances of this class from being copied (As this class contains pointers). */
+ CLTensorAllocator(const CLTensorAllocator &) = delete;
+ /** Prevent instances of this class from being copy assigned (As this class contains pointers). */
+ CLTensorAllocator &operator=(const CLTensorAllocator &) = delete;
+ /** Allow instances of this class to be moved */
+ CLTensorAllocator(CLTensorAllocator &&) = default;
+ /** Allow instances of this class to be moved */
+ CLTensorAllocator &operator=(CLTensorAllocator &&) = default;
+ /** Default destructor */
+ ~CLTensorAllocator() = default;
+
+ /** Return a pointer to the CPU mapping of the OpenCL buffer. */
+ uint8_t *data();
+ /** Return a const reference to the underlying OpenCL buffer. */
+ const cl::Buffer &cl_data() const;
+ /** Enqueue a map operation of the allocated buffer on the given queue.
+ *
+ * @param[in,out] q The CL command queue to use for the mapping operation.
+ * @param[in] blocking If true, then the mapping will be ready to use by the time
+ * this method returns, else it is the caller's responsibility
+ * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
+ *
+ * @return The mapping address.
+ */
+ uint8_t *map(cl::CommandQueue &q, bool blocking);
+ /** Enqueue an unmap operation of the allocated buffer on the given queue.
+ *
+ * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
+ * the memory is accessed by the device.
+ *
+ * @param[in,out] q The CL command queue to use for the unmap operation.
+ * @param[in] mapping The CPU mapping to unmap.
+ */
+ void unmap(cl::CommandQueue &q, uint8_t *mapping);
+
+ /** Allocate OpenCL memory of the size specified by the tensor's TensorInfo.
+ *
+ * @note The tensor must not already be allocated when calling this function.
+ *
+ */
+ void allocate() override;
+
+ /** Free allocated OpenCL memory.
+ *
+ * @note The tensor must have been allocated when calling this function.
+ *
+ */
+ void free() override;
+
+protected:
+ /** Call map() on the OpenCL buffer.
+ *
+ * @return A pointer to the beginning of the tensor's allocation.
+ */
+ uint8_t *lock() override;
+ /** Call unmap() on the OpenCL buffer. */
+ void unlock() override;
+
+private:
+ cl::Buffer _buffer; /**< OpenCL buffer containing the tensor data. */
+ uint8_t *_mapping; /**< Pointer to the CPU mapping of the OpenCL buffer. */
+};
+}
+#endif /* __ARM_COMPUTE_CLTENSORALLOCATOR_H__ */
diff --git a/arm_compute/runtime/CL/ICLSimpleFunction.h b/arm_compute/runtime/CL/ICLSimpleFunction.h
new file mode 100644
index 0000000000..130c58a98c
--- /dev/null
+++ b/arm_compute/runtime/CL/ICLSimpleFunction.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ICLSIMPLEFUNCTION_H__
+#define __ARM_COMPUTE_ICLSIMPLEFUNCTION_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic interface for functions which have a single OpenCL kernel */
+class ICLSimpleFunction : public IFunction
+{
+public:
+ /** Default constructor */
+ ICLSimpleFunction();
+
+ // Inherited methods overridden:
+ void run() override final;
+
+protected:
+ std::unique_ptr<ICLKernel> _kernel; /**< Kernel to run */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle borders */
+};
+}
+#endif /*__ARM_COMPUTE_ICLSIMPLEFUNCTION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
new file mode 100644
index 0000000000..40ee396644
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H__
+#define __ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLAbsoluteDifferenceKernel
+ *
+ * @note The tensor data types for the inputs must be U8 or S16.
+ * @note The function also calculates the absolute difference when the two inputs have different data types.
+ */
+class CLAbsoluteDifference : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input1 First input tensor. Data types supported: U8, S16
+ * @param[in] input2 Second input tensor. Data types supported: U8, S16
+ * @param[out] output Output tensor. Data types supported: U8, S16
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H__ */
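
All single-kernel functions in this directory follow the same configure-then-run pattern; a minimal sketch (tensor setup as in the CLTensor example above, shapes are illustrative):

    CLTensor in1, in2, out;
    for(CLTensor *t : { &in1, &in2, &out })
    {
        t->allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
        t->allocator()->allocate();
    }
    CLAbsoluteDifference absdiff;
    absdiff.configure(&in1, &in2, &out); // validates inputs, prepares the kernel
    absdiff.run();                       // enqueues CLAbsoluteDifferenceKernel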
diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h
new file mode 100644
index 0000000000..51f6df9acb
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLAccumulate.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLACCUMULATE_H__
+#define __ARM_COMPUTE_CLACCUMULATE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLAccumulateKernel */
+class CLAccumulate : public ICLSimpleFunction
+{
+public:
+ /** Set the input and accumulation tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[out] accum Destination tensor. Data types supported: S16.
+ */
+ void configure(const ICLTensor *input, ICLTensor *accum);
+};
+
+/** Basic function to run @ref CLAccumulateWeightedKernel */
+class CLAccumulateWeighted : public ICLSimpleFunction
+{
+public:
+ /** Set the input and accumulation tensors, and the scale value.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[in] alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
+ * @param[in,out] accum Accumulated tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
+};
+
+/** Basic function to run @ref CLAccumulateSquaredKernel */
+class CLAccumulateSquared : public ICLSimpleFunction
+{
+public:
+ /** Set the input and accumulation tensors and the shift value.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+ * @param[in] shift Shift value in the range [0, 15]. Data types supported: U32.
+ * @param[in,out] accum Accumulated tensor. Data types supported: S16.
+ */
+ void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
+};
+}
+#endif /*__ARM_COMPUTE_CLACCUMULATE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h
new file mode 100644
index 0000000000..6468c996a2
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLACTIVATIONLAYER_H__
+#define __ARM_COMPUTE_CLACTIVATIONLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLActivationLayerKernel
+ *
+ * @note The function simulates an activation layer with the specified activation function.
+ */
+class CLActivationLayer : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32, U16, S16.
+ * @param[out] output Destination tensor. Data type should match the input data type.
+ * @param[in] act_info Activation layer parameters.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+};
+}
+#endif /* __ARM_COMPUTE_CLACTIVATIONLAYER_H__ */
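
A minimal sketch of passing the activation parameters, assuming a ReLU activation and tensors set up as in the earlier examples:

    CLActivationLayer act;
    // ActivationLayerInfo bundles the function and its optional a/b parameters.
    act.configure(&input, &output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run();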
diff --git a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
new file mode 100644
index 0000000000..feadf39820
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICADDITION_H__
+#define __ARM_COMPUTE_CLARITHMETICADDITION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLArithmeticAdditionKernel
+ *
+ * @note The tensor data type for the inputs must be U8, S16, F16, F32.
+ * @note The function performs an arithmetic addition between two tensors.
+ */
+class CLArithmeticAddition : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+};
+}
+#endif /* __ARM_COMPUTE_CLARITHMETICADDITION_H__ */
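
A minimal sketch showing the overflow policy argument, with tensors as in the earlier examples (SATURATE clamps to the output type's range instead of wrapping):

    CLArithmeticAddition add;
    add.configure(&input1, &input2, &output, ConvertPolicy::SATURATE);
    add.run();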
diff --git a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
new file mode 100644
index 0000000000..d7bb21144e
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__
+#define __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLArithmeticSubtractionKernel
+ *
+ * @note The tensor data type for the inputs must be U8, S16, F16, F32
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class CLArithmeticSubtraction : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16, F32.
+ * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F16, F32.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy);
+};
+}
+#endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
new file mode 100644
index 0000000000..d766d1c69c
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBatchNormalizationLayerKernel and simulate a batch normalization layer.
+ *
+ * Batch normalization is calculated by:
+ * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
+ *
+ */
+class CLBatchNormalizationLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ CLBatchNormalizationLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division by zero.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */
+};
+}
+#endif /* __ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H__ */
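
A minimal sketch of wiring up the per-feature-map statistics, assuming a [width, height, FM] input and [FM] statistics tensors as documented above (the epsilon value is illustrative):

    CLBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f /* epsilon */);
    bn.run(); // out_i = gamma * (in_i - mu) / sqrt(var + epsilon) + beta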
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
new file mode 100644
index 0000000000..a4a523baaa
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISEAND_H__
+#define __ARM_COMPUTE_CLBITWISEAND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBitwiseAndKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a bitwise AND operation using the two input tensors.
+ */
+class CLBitwiseAnd : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input1 Input tensor. Data types supported: U8.
+ * @param[in] input2 Input tensor. Data types supported: U8.
+ * @param[out] output Output tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISEAND_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
new file mode 100644
index 0000000000..0ff16af870
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISENOT_H__
+#define __ARM_COMPUTE_CLBITWISENOT_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBitwiseNotKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a bitwise NOT operation on input tensor.
+ */
+class CLBitwiseNot : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input Input tensor. Data types supported: U8.
+ * @param[out] output Output tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISENOT_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
new file mode 100644
index 0000000000..880c4762be
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISEOR_H__
+#define __ARM_COMPUTE_CLBITWISEOR_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBitwiseOrKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a bitwise OR operation using the two input tensors.
+ */
+class CLBitwiseOr : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input1 Input tensor. Data types supported: U8.
+ * @param[in] input2 Input tensor. Data types supported: U8.
+ * @param[out] output Output tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISEOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
new file mode 100644
index 0000000000..772dec22ea
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBITWISEXOR_H__
+#define __ARM_COMPUTE_CLBITWISEXOR_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLBitwiseXorKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a bitwise XOR operation using the two input tensors.
+ */
+class CLBitwiseXor : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] input1 Input tensor. Data types supported: U8.
+ * @param[in] input2 Input tensor. Data types supported: U8.
+ * @param[out] output Output tensor. Data types supported: U8.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLBITWISEXOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h
new file mode 100644
index 0000000000..5e51c1a390
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLBox3x3.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLBOX3X3_H__
+#define __ARM_COMPUTE_CLBOX3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute box filter 3x3. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLBox3x3Kernel
+ *
+ */
+class CLBox3x3 : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destination and border mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLBOX3X3_H__ */
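
A minimal sketch of the border-mode choice (a CONSTANT border uses the optional fill value, while UNDEFINED skips @ref CLFillBorderKernel entirely); tensors as in the earlier examples:

    CLBox3x3 box;
    box.configure(&input, &output, BorderMode::CONSTANT, 255 /* border fill value */);
    box.run();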
diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h
new file mode 100644
index 0000000000..e5a82b2263
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCANNYEDGE_H__
+#define __ARM_COMPUTE_CLCANNYEDGE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute the Canny edge detector on OpenCL. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLFillBorderKernel (if border_mode == REPLICATE or border_mode == CONSTANT)
+ * -# @ref CLSobel3x3 (if gradient_size == 3) or @ref CLSobel5x5 (if gradient_size == 5) or @ref CLSobel7x7 (if gradient_size == 7)
+ * -# @ref CLGradientKernel
+ * -# @ref CLEdgeNonMaxSuppressionKernel
+ * -# @ref CLEdgeTraceKernel
+ *
+ */
+class CLCannyEdge : public IFunction
+{
+public:
+ /** Constructor */
+ CLCannyEdge();
+ /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] upper_thr Upper threshold used for the hysteresis.
+ * @param[in] lower_thr Lower threshold used for the hysteresis.
+ * @param[in] gradient_size Gradient size (3, 5 or 7).
+ * @param[in] norm_type Normalization type: if 1, L1-Norm, otherwise L2-Norm.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type,
+ BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ virtual void run() override;
+
+private:
+ std::unique_ptr<IFunction> _sobel; /**< Pointer to Sobel kernel. */
+ CLGradientKernel _gradient; /**< Gradient kernel. */
+ CLFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */
+ CLEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel. */
+ CLEdgeTraceKernel _edge_trace; /**< Edge tracing kernel. */
+ CLImage _gx; /**< Source tensor - Gx component. */
+ CLImage _gy; /**< Source tensor - Gy component. */
+ CLImage _mag; /**< Source tensor - Magnitude. */
+ CLImage _phase; /**< Source tensor - Phase. */
+ CLImage _nonmax; /**< Source tensor - Non-Maxima suppressed. */
+ CLImage _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */
+};
+}
+
+#endif /* __ARM_COMPUTE_CLCANNYEDGE_H__ */
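
A minimal sketch of a full Canny configuration (threshold, gradient and normalization values are illustrative; tensors as in the earlier examples):

    CLCannyEdge canny;
    canny.configure(&input, &output,
                    150 /* upper_thr */, 100 /* lower_thr */,
                    3 /* gradient_size: selects CLSobel3x3 */,
                    1 /* norm_type: L1-Norm */,
                    BorderMode::REPLICATE);
    canny.run();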
diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h
new file mode 100644
index 0000000000..337e6b4820
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLChannelCombine.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCHANNELCOMBINE_H__
+#define __ARM_COMPUTE_CLCHANNELCOMBINE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to run @ref CLChannelCombineKernel to perform channel combination. */
+class CLChannelCombine : public ICLSimpleFunction
+{
+public:
+ /** Initialize function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
+ * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
+ * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
+ * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format.
+ * @param[out] output The single planar output tensor.
+ */
+ void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
+ /** Initialize function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
+ * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
+ * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
+ * @param[out] output The multi planar output image.
+ */
+ void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
+};
+}
+#endif /*__ARM_COMPUTE_CLCHANNELCOMBINE_H__*/
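
A short configuration sketch for the single-planar overload, reusing the setup from the CLCannyEdge sketch above (the sizes and the RGB888 output format are assumptions):

    CLTensor r, g, b, rgb;
    r.allocator()->init(TensorInfo(320U, 240U, Format::U8));
    g.allocator()->init(TensorInfo(320U, 240U, Format::U8));
    b.allocator()->init(TensorInfo(320U, 240U, Format::U8));
    rgb.allocator()->init(TensorInfo(320U, 240U, Format::RGB888));

    CLChannelCombine combine;
    // Three-channel formats take three planes; plane3 is only needed for four-channel formats.
    combine.configure(&r, &g, &b, nullptr, &rgb);
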
diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h
new file mode 100644
index 0000000000..1753374622
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCHANNELEXTRACT_H__
+#define __ARM_COMPUTE_CLCHANNELEXTRACT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to run @ref CLChannelExtractKernel to perform channel extraction. */
+class CLChannelExtract : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination
+ *
+ * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar.
+ * @param[in] channel The channel to extract.
+ * @param[out] output The extracted channel. Must be of U8 format.
+ */
+ void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
+ /** Initialize the function's source, destination
+ *
+ * @param[in] input The multi-planar input image to extract channel from.
+ * @param[in] channel The channel to extract.
+ * @param[out] output The extracted 2D channel. Must be of U8 format.
+ */
+ void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
+};
+}
+#endif /*__ARM_COMPUTE_CLCHANNELEXTRACT_H__*/
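
The inverse direction, extracting one plane from a packed image (again an illustrative sketch; the format and size are assumptions):

    CLTensor rgb, red;
    rgb.allocator()->init(TensorInfo(320U, 240U, Format::RGB888));
    red.allocator()->init(TensorInfo(320U, 240U, Format::U8));

    CLChannelExtract extract;
    extract.configure(&rgb, Channel::R, &red);
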
diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h
new file mode 100644
index 0000000000..12457a0cf2
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLColorConvert.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCOLORCONVERT_H__
+#define __ARM_COMPUTE_CLCOLORCONVERT_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLMultiImage;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to run @ref CLColorConvertKernel
+ *
+ * @note The function performs color conversion between images.
+ */
+class CLColorConvert : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination
+ *
+ * @param[in] input The input single-planar tensor from which to convert
+ * @param[in] output The converted single-planar output tensor
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+ /** Initialize the function's source, destination
+ *
+ * @param[in] input The multi-planar input image from which to convert
+ * @param[in] output The converted single-planar output image
+ */
+ void configure(const ICLMultiImage *input, ICLImage *output);
+ /** Initialize the function's source, destination
+ *
+ * @param[in] input The single-planar input image from which to convert
+ * @param[in] output The converted multi-planar output image
+ */
+ void configure(const ICLImage *input, ICLMultiImage *output);
+ /** Initialize the function's source, destination
+ *
+ * @param[in] input The multi-planar input image from which to convert
+ * @param[in] output The converted multi-planar output image
+ */
+ void configure(const ICLMultiImage *input, ICLMultiImage *output);
+};
+}
+#endif /* __ARM_COMPUTE_CLCOLORCONVERT_H__ */
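
A configuration sketch for the single-planar overload (the RGB888 to RGBA8888 pairing is one plausible conversion chosen for illustration, not a statement of the supported format list):

    CLTensor rgb, rgba;
    rgb.allocator()->init(TensorInfo(320U, 240U, Format::RGB888));
    rgba.allocator()->init(TensorInfo(320U, 240U, Format::RGBA8888));

    CLColorConvert convert;
    convert.configure(&rgb, &rgba);
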
diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h
new file mode 100644
index 0000000000..f526f6ff4a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLConvolution.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCONVOLUTION_H__
+#define __ARM_COMPUTE_CLCONVOLUTION_H__
+
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute convolution of size 3x3. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLConvolution3x3Kernel
+ *
+ */
+class CLConvolution3x3 : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor, Data types supported: U8 or S16.
+ * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+
+/** Basic function to execute square convolution. Currently it supports 5x5, 7x7 and 9x9 kernel sizes. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLConvolutionKernel or<br/>
+ * @ref CLSeparableConvolutionHorKernel and @ref CLSeparableConvolutionVertKernel (if convolution matrix is separable)
+ *
+ */
+template <unsigned int matrix_size>
+class CLConvolutionSquare : public IFunction
+{
+public:
+ /** Default constructor */
+ CLConvolutionSquare();
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor, Data types supported: U8 or S16.
+ * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLTensor _tmp; /**< temporary buffer for output of horizontal pass */
+ bool _is_separable; /**< true if the convolution can be separated */
+ CLSeparableConvolutionHorKernel<matrix_size> _kernel_hor; /**< kernel for horizontal pass of separated convolution */
+ CLSeparableConvolutionVertKernel<matrix_size> _kernel_vert; /**< kernel for vertical pass of separated convolution */
+ CLConvolutionKernel<matrix_size> _kernel; /**< kernel for non-separated convolution */
+ CLFillBorderKernel _border_handler; /**< kernel for border handling */
+};
+
+/** Basic function to run 5x5 convolution. */
+using CLConvolution5x5 = CLConvolutionSquare<5>;
+/** Basic function to run 7x7 convolution. */
+using CLConvolution7x7 = CLConvolutionSquare<7>;
+/** Basic function to run 9x9 convolution. */
+using CLConvolution9x9 = CLConvolutionSquare<9>;
+
+/** Basic function to execute non-square convolution. This function calls the following CL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLConvolutionRectangleKernel
+ *
+ * @note The convolution rectangle's dimensions (rows and columns) must each be 3, 5, 7 or 9.
+ */
+class CLConvolutionRectangle : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor, Data types supported: U8 or S16.
+ * @param[in] conv rows x cols S16 coefficients structured as a row-major 2D array in a linear buffer.
+ * @param[in] rows Rows of convolution kernel.
+ * @param[in] cols Columns of convolution kernel.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLCONVOLUTION_H__ */
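
A sketch of CLConvolution3x3 with an explicit coefficient array, showing how conv and scale relate (the coefficients and sizes are illustrative assumptions):

    // 3x3 smoothing kernel in row-major order; scale 16 equals the sum of the
    // coefficients, which keeps the output in the same range as the input.
    const int16_t conv[9] = { 1, 2, 1,
                              2, 4, 2,
                              1, 2, 1 };

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(320U, 240U, Format::U8));
    dst.allocator()->init(TensorInfo(320U, 240U, Format::U8));

    CLConvolution3x3 conv3x3;
    conv3x3.configure(&src, &dst, conv, 16, BorderMode::UNDEFINED);
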
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
new file mode 100644
index 0000000000..6a40396f9a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Function to reshape and transpose the weights. This function calls the following kernels:
+ * -# @ref CLWeightsReshapeKernel
+ * -# @ref CLGEMMTranspose1xWKernel
+ */
+class CLConvolutionLayerReshapeWeights : public IFunction
+{
+public:
+ /** Constructor */
+ CLConvolutionLayerReshapeWeights();
+ /** Set the input and output tensors.
+ *
+ * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: F32.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+ * @param[out] output Destination tensor. Data types supported: Same as @p weights.
+ * @param[in] transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
+ */
+ void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW);
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
+ CLGEMMTranspose1xWKernel _weights_transposed_kernel;
+ CLTensor _weights_reshaped;
+ bool _transpose1xW;
+};
+
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLConvolutionLayerWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref CLGEMMTranspose1xWKernel (executed only once for each configuration)
+ * -# @ref CLIm2ColKernel
+ * -# @ref CLGEMMInterleave4x4Kernel
+ * -# @ref CLGEMMMatrixMultiplyKernel
+ * -# @ref CLCol2ImKernel
+ */
+class CLConvolutionLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ CLConvolutionLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16, F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with CLGEMMTranspose1xWKernel.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLConvolutionLayerReshapeWeights _reshape_weights;
+ CLIm2ColKernel _input_im2col_kernel;
+ CLGEMMInterleave4x4Kernel _input_interleave_kernel;
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLCol2ImKernel _output_col2im_kernel;
+ CLTensor _input_im2col_reshaped;
+ CLTensor _input_interleaved_reshaped;
+ CLTensor _weights_reshaped;
+ CLTensor _weights_transposed;
+ CLTensor _gemm_output;
+ bool _has_bias;
+ bool _is_fully_connected_convolution;
+ bool _are_weights_reshaped;
+};
+}
+#endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */
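
A shape-level sketch of CLConvolutionLayer::configure(); the layer sizes and PadStrideInfo values below are assumptions chosen so the shapes line up:

    // Hypothetical layer: 16x16 input with 8 feature maps, 3x3 kernels, 16 output feature maps.
    CLTensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    // Stride 1 with 1-pixel padding preserves the spatial size.
    output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32));

    CLConvolutionLayer conv;
    conv.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));
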
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
new file mode 100644
index 0000000000..3199936b82
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_CLDEPTHCONCATENATE_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ICLTensor;
+class CLDepthConcatenateKernel;
+class CLFillBorderKernel;
+
+/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
+ * -# @ref CLDepthConcatenateKernel
+ *
+ */
+class CLDepthConcatenate : public IFunction
+{
+public:
+ /** Default constructor */
+ CLDepthConcatenate();
+ /** Initialise the function's inputs vector and output.
+ *
+ * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+ * @param[out] output Output tensor. Data types supported: F32.
+ */
+ void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::vector<ICLTensor *> _inputs_vector;
+ std::unique_ptr<CLDepthConcatenateKernel[]> _concat_kernels_vector;
+ std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector;
+ unsigned int _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_CLDEPTHCONCATENATE_H__ */
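
A sketch showing the shape contract: the output depth is the sum of the input depths, while the lower two dimensions may differ (the shapes are assumptions):

    CLTensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(16U, 16U, 4U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(16U, 16U, 6U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 16U, 10U), 1, DataType::F32)); // 4 + 6 planes

    CLDepthConcatenate concat;
    concat.configure({ &in0, &in1 }, &out);
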
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvert.h b/arm_compute/runtime/CL/functions/CLDepthConvert.h
new file mode 100644
index 0000000000..f11027656d
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDepthConvert.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDEPTHCONVERT_H__
+#define __ARM_COMPUTE_CLDEPTHCONVERT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLDepthConvertKernel. */
+class CLDepthConvert : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination
+ *
+ * The input data type must be different from the output data type.
+ *
+ * Valid conversions Input -> Output :
+ *
+ * - U8 -> U16, S16, U32, S32
+ * - U16 -> U8, U32, S32
+ * - S16 -> U8, U32, S32
+ * - U32 -> U8, U16, S16
+ * - S32 -> U8, U16, S16
+ *
+ * @param[in] input The input tensor to convert. Data types supported: U8, U16, S16, U32 or S32.
+ * @param[out] output The output tensor. Data types supported: U8, U16, S16, U32 or S32.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+};
+}
+#endif /*__ARM_COMPUTE_CLDEPTHCONVERT_H__*/
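
A sketch of an up-conversion using the table above (U8 -> U16 is one of the listed valid pairs; the shift value is an assumption):

    CLTensor in_u8, out_u16;
    in_u8.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::U8));
    out_u16.allocator()->init(TensorInfo(TensorShape(320U, 240U), 1, DataType::U16));

    CLDepthConvert convert;
    // Convert U8 to U16, shifting each value left by 4 bits.
    convert.configure(&in_u8, &out_u16, ConvertPolicy::SATURATE, 4);
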
diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h
new file mode 100644
index 0000000000..05033e8172
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDerivative.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDERIVATIVE_H__
+#define __ARM_COMPUTE_CLDERIVATIVE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute first order derivative operator. This function calls the following CL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLDerivativeKernel
+ *
+ */
+class CLDerivative : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must not be nullptr.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (Optional) Destination tensor. Derivative along the X direction. Data types supported: S16.
+ * @param[out] output_y (Optional) Destination tensor. Derivative along the Y direction. Data types supported: S16.
+ * @param[in] border_mode Border mode to use
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /* __ARM_COMPUTE_CLDERIVATIVE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h
new file mode 100644
index 0000000000..8534139c86
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLDilate.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDILATE_H__
+#define __ARM_COMPUTE_CLDILATE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute dilate. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLDilateKernel
+ *
+ */
+class CLDilate : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and border mode.
+ *
+ * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Output tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLDILATE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
new file mode 100644
index 0000000000..d7182756b5
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H__
+#define __ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H__
+
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+#include "arm_compute/runtime/CL/CLDistribution1D.h"
+#include "arm_compute/runtime/CL/CLLut.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute histogram equalization. This function calls the following CL kernels:
+ *
+ * -# @ref CLHistogramKernel
+ * -# @ref CLTableLookupKernel
+ *
+ */
+class CLEqualizeHistogram : public IFunction
+{
+public:
+ /** Default Constructor. */
+ CLEqualizeHistogram();
+ /** Initialise the kernel's inputs.
+ *
+ * @param[in] input Input image. Data types supported: U8.
+ * @param[out] output Output of same data type with equalized brightness and contrast.
+ */
+ void configure(const ICLImage *input, ICLImage *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLHistogramKernel _histogram_kernel; /**< Kernel that calculates the histogram of input. */
+ CLHistogramBorderKernel _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */
+ CLTableLookupKernel _map_histogram_kernel; /**< Kernel that maps the input to output using the lut. */
+ CLDistribution1D _hist; /**< Distribution that holds the histogram of the input image. */
+ CLDistribution1D _cum_dist; /**< Distribution that holds the cumulative distribution of the input histogram. */
+ CLLut _cd_lut; /**< Holds the equalization lookup table. */
+ static const uint32_t max_range = 256; /**< Histogram range of the internal histograms. */
+ static const uint32_t nr_bins = 256; /**< Histogram bins of the internal histograms. */
+};
+}
+#endif /*__ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h
new file mode 100644
index 0000000000..cd2f5516e2
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLErode.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLERODE_H__
+#define __ARM_COMPUTE_CLERODE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute erode. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLErodeKernel
+ *
+ */
+class CLErode : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and border mode
+ *
+ * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Output tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLERODE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h
new file mode 100644
index 0000000000..79d82af462
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLFastCorners.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFASTCORNERS_H__
+#define __ARM_COMPUTE_CLFASTCORNERS_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute FAST corner detection. This function calls the following CL kernels:
+ *
+ * -# @ref CLFastCornersKernel
+ * -# @ref CLNonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
+ * -# @ref CLCopyToArrayKernel
+ *
+ */
+class CLFastCorners : public IFunction
+{
+public:
+ /** Constructor */
+ CLFastCorners();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFastCorners(const CLFastCorners &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ const CLFastCorners &operator=(const CLFastCorners &) = delete;
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in] input Source image. Data types supported: U8.
+ * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+ * @param[in] nonmax_suppression If true, non-maximum suppression is applied to detected corners before being placed in the array.
+ * @param[out] corners Array of keypoints to store the results.
+ * @param[in,out] num_corners Pointer updated with the number of corners stored in the array.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(const ICLImage *input, float threshold, bool nonmax_suppression, CLKeyPointArray *corners, unsigned int *num_corners,
+ BorderMode border_mode, uint8_t constant_border_value = 0);
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLFastCornersKernel _fast_corners_kernel;
+ CLNonMaximaSuppression3x3 _suppr_func;
+ CLCopyToArrayKernel _copy_array_kernel;
+ CLImage _output;
+ CLImage _suppr;
+ Window _win;
+ bool _non_max;
+ unsigned int *_num_corners;
+ cl::Buffer _num_buffer;
+ CLKeyPointArray *_corners;
+ uint8_t _constant_border_value;
+};
+}
+#endif /*__ARM_COMPUTE_CLFASTCORNERS_H__ */
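
A configuration sketch wiring up the keypoint array and the corner counter (the capacity, threshold and image size are assumptions):

    CLTensor src;
    src.allocator()->init(TensorInfo(640U, 480U, Format::U8));

    CLKeyPointArray corners(10000); // maximum number of keypoints the array can hold
    unsigned int num_corners = 0;   // updated by the function after run()

    CLFastCorners fast;
    fast.configure(&src, 20.0f, true /* nonmax_suppression */, &corners, &num_corners, BorderMode::UNDEFINED);
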
diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h
new file mode 100644
index 0000000000..b4855475c3
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLFillBorder.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFILLBORDER_H__
+#define __ARM_COMPUTE_CLFILLBORDER_H__
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLFillBorderKernel */
+class CLFillBorder : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in,out] tensor Source tensor. Data types supported: U8, S16
+ * @param[in] border_width The border width
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+};
+}
+#endif /*__ARM_COMPUTE_CLFILLBORDER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
new file mode 100644
index 0000000000..826f445bd8
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls the following kernels:
+ *
+ * -# @ref CLTransposeKernel (if @p transpose_weights is set to true)
+ * -# @ref CLGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedLayerReshapeWeights : public IFunction
+{
+public:
+ /** Constructor */
+ CLFullyConnectedLayerReshapeWeights();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] transpose_weights True if the weights must be transposed.
+ * @param[in] is_batched_fc_layer True if it is a batched fully connected layer
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLTransposeKernel _transpose_kernel;
+ CLGEMMTranspose1xWKernel _transpose1xW_kernel;
+ CLTensor _transpose_output;
+ bool _transpose_weights;
+ bool _is_batched_fc_layer;
+};
+
+/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false) (called once)
+ * -# @ref CLGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ * -# @ref CLGEMMMatrixMultiplyKernel
+ * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CLFullyConnectedLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLFullyConnectedLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
+ * @param[in] biases Bias tensor. It can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] transpose_weights (Optional) Transpose weights if true. Defaults to true.
+ * @param[in] are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+ void configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+ void configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+ void configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+
+ CLIm2ColKernel _im2col_kernel;
+ CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+ CLGEMMInterleave4x4Kernel _interleave4x4_kernel;
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ CLTensor _im2col_output;
+ CLTensor _interleave4x4_output;
+ CLTensor _reshape_weights_output;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _is_batched_fc_layer;
+ bool _accumulate_biases;
+};
+}
+#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */
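
A shape-level sketch for the fully connected layer. The 128-input/10-output sizes are assumptions, and the 2D weights layout shown assumes the default transpose_weights == true path (the function transposes the weights internally):

    CLTensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32)); // 2D weights, transposed internally
    biases.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    CLFullyConnectedLayer fc;
    fc.configure(&input, &weights, &biases, &output);
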
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
new file mode 100644
index 0000000000..043b2b8115
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMM_H__
+#define __ARM_COMPUTE_CLGEMM_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute GEMM on OpenCL. Data types supported: F32, F16. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref CLGEMMTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref CLGEMMMatrixMultiplyKernel
+ * -# @ref CLGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
+ *
+ */
+class CLGEMM : public IFunction
+{
+public:
+ /** Default constructor. */
+ CLGEMM();
+ /** Initialise the kernel's inputs and output
+ *
+ * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+ *
+ * @note All tensors must have the same data type. Data types supported: F32, F16
+ *
+ * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
+ *
+ * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F32, F16
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
+ * @param[out] output Output tensor. Data type supported: same as @p a
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of matrix C
+ */
+ void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLGEMMInterleave4x4Kernel _interleave_kernel;
+ CLGEMMTranspose1xWKernel _transpose_kernel;
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLGEMMMatrixAdditionKernel _ma_kernel;
+ CLTensor _tmp_a;
+ CLTensor _tmp_b;
+ bool _run_vector_matrix_multiplication;
+ bool _run_addition;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLGEMM_H__ */
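
A sketch of the GEMM call for D = alpha * A * B + beta * C with hypothetical sizes M = 32, K = 64, N = 16 (TensorShape lists the innermost dimension first, i.e. columns before rows):

    CLTensor a, b, c, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // A: 32 x 64
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));   // B: 64 x 16
    c.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));   // C: 32 x 16
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // D: 32 x 16

    CLGEMM gemm;
    gemm.configure(&a, &b, &c, &dst, 1.0f /* alpha */, 1.0f /* beta */);
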
diff --git a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h
new file mode 100644
index 0000000000..b80136b328
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__
+#define __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute CLGEMMInterleave4x4Kernel. This function calls the following OpenCL kernel:
+ *
+ * -# @ref CLGEMMInterleave4x4Kernel
+ *
+ */
+class CLGEMMInterleave4x4 : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input First input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ */
\ No newline at end of file
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
new file mode 100644
index 0000000000..da8883c3f8
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowp.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWP_H__
+#define __ARM_COMPUTE_CLGEMMLOWP_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute GEMMLowp on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMInterleave4x4Kernel
+ * -# @ref CLGEMMTranspose1xWKernel
+ * -# @ref CLGEMMLowpMatrixMultiplyKernel
+ *
+ */
+class CLGEMMLowp : public IFunction
+{
+public:
+ /** Constructor */
+ CLGEMMLowp();
+    /** Initialise the kernel's inputs and output.
+ *
+ * @note GEMM_LOWP: low precision matrix multiply kernel
+ * This kernel performs the following computation:
+ *
+     *  -# Convert the values of matrix A from uint8 to int32 and add a_offset to each of them.
+     *  -# Convert the values of matrix B from uint8 to int32 and add b_offset to each of them.
+     *  -# Compute the int32 matrix product of the resulting A * B.
+     *  -# Add output_offset to each entry of the result.
+     *  -# Multiply each entry of the result by output_mult_int, shift right by shift bits, and round to the nearest integer.
+     *  -# Clamp the resulting int32 values to the [0..255] range and cast them to uint8.
+ *
+ * @param[in] a First input tensor (Matrix A). Data types supported: U8.
+ * @param[in] b Second input tensor (Matrix B). Data types supported: same as @p a.
+ * @param[out] output Output tensor. Data types supported: same as @p a.
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+     * @param[in]  output_offset   Offset to be added to each element of the output matrix.
+     * @param[in]  output_mult_int Value each element of the output matrix is multiplied by.
+     * @param[in]  shift           Number of bits by which to shift the result to the right.
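+     *
+     * A minimal usage sketch (assumes a, b and out are pre-initialised U8 CLTensor
+     * objects; the offset, multiplier and shift values are illustrative placeholders):
+     * @code
+     * CLGEMMLowp gemmlowp;
+     * gemmlowp.configure(&a, &b, &out, -128, -128, 0, 1, 8);
+     * gemmlowp.run();
+     * @endcode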
+ */
+ void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLGEMMInterleave4x4Kernel _interleave_kernel;
+ CLGEMMTranspose1xWKernel _transpose_kernel;
+ CLGEMMLowpMatrixMultiplyKernel _mm_kernel;
+ CLTensor _tmp_a;
+ CLTensor _tmp_b;
+};
+}
+#endif /*__ARM_COMPUTE_CLGEMMLOWP_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
new file mode 100644
index 0000000000..f8223bc5f5
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGAUSSIAN3X3_H__
+#define __ARM_COMPUTE_CLGAUSSIAN3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute Gaussian filter 3x3. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLGaussian3x3Kernel
+ *
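+ * A minimal usage sketch (assumes src and dst are pre-initialised and
+ * allocated U8 CLTensor objects of the same shape):
+ * @code
+ * CLGaussian3x3 gauss3x3;
+ * gauss3x3.configure(&src, &dst, BorderMode::REPLICATE);
+ * gauss3x3.run();
+ * @endcode
+ *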
+ */
+class CLGaussian3x3 : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIAN3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
new file mode 100644
index 0000000000..148b9a9924
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGAUSSIAN5X5_H__
+#define __ARM_COMPUTE_CLGAUSSIAN5X5_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute Gaussian filter 5x5. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLGaussian5x5HorKernel
+ * -# @ref CLGaussian5x5VertKernel
+ *
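+ * A minimal usage sketch (assumes src and dst are pre-initialised and
+ * allocated U8 CLTensor objects of the same shape):
+ * @code
+ * CLGaussian5x5 gauss5x5;
+ * gauss5x5.configure(&src, &dst, BorderMode::REPLICATE);
+ * gauss5x5.run();
+ * @endcode
+ *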
+ */
+class CLGaussian5x5 : public IFunction
+{
+public:
+ /** Default Constructor. */
+ CLGaussian5x5();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ CLGaussian5x5HorKernel _kernel_hor; /**< Horizontal pass kernel */
+ CLGaussian5x5VertKernel _kernel_vert; /**< Vertical pass kernel */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+ CLImage _tmp; /**< Temporary buffer */
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIAN5X5_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
new file mode 100644
index 0000000000..97935193dc
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGAUSSIANPYRAMID_H__
+#define __ARM_COMPUTE_CLGAUSSIANPYRAMID_H__
+
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Common interface for all Gaussian pyramid functions
+ */
+class CLGaussianPyramid : public IFunction
+{
+public:
+ /** Constructor */
+ CLGaussianPyramid();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGaussianPyramid(const CLGaussianPyramid &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGaussianPyramid &operator=(const CLGaussianPyramid &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGaussianPyramid(CLGaussianPyramid &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGaussianPyramid &operator=(CLGaussianPyramid &&) = default;
+ /** Default destructor */
+ virtual ~CLGaussianPyramid() = default;
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in, out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     pyramid               Destination pyramid tensors. Data types supported at each level: U8.
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ virtual void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0;
+
+protected:
+ ICLTensor *_input;
+ CLPyramid *_pyramid;
+ CLPyramid _tmp;
+};
+
+/** Basic function to execute Gaussian pyramid with HALF scale factor. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLGaussianPyramidHorKernel
+ * -# @ref CLGaussianPyramidVertKernel
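+ *
+ * A minimal usage sketch (assumes src is an initialised U8 CLTensor and pyr a
+ * CLPyramid initialised with a SCALE_PYRAMID_HALF @ref PyramidInfo and allocated):
+ * @code
+ * CLGaussianPyramidHalf gauss_pyr;
+ * gauss_pyr.configure(&src, &pyr, BorderMode::REPLICATE, 0);
+ * gauss_pyr.run();
+ * @endcode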
+ */
+class CLGaussianPyramidHalf : public CLGaussianPyramid
+{
+public:
+ /** Constructor */
+ CLGaussianPyramidHalf();
+
+ // Inherited methods overridden:
+ void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
+ void run() override;
+
+private:
+ std::unique_ptr<CLFillBorderKernel[]> _border_handler;
+ std::unique_ptr<CLGaussianPyramidHorKernel[]> _horizontal_reduction;
+ std::unique_ptr<CLGaussianPyramidVertKernel[]> _vertical_reduction;
+};
+
+/** Basic function to execute Gaussian pyramid with ORB scale factor. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLGaussian5x5
+ * -# @ref CLScaleKernel
+ */
+class CLGaussianPyramidOrb : public CLGaussianPyramid
+{
+public:
+ /** Constructor */
+ CLGaussianPyramidOrb();
+
+ // Inherited methods overridden:
+ void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
+ void run() override;
+
+private:
+ std::unique_ptr<CLGaussian5x5[]> _gauss5x5;
+ std::unique_ptr<CLScaleKernel[]> _scale_nearest;
+};
+}
+#endif /*__ARM_COMPUTE_CLGAUSSIANPYRAMID_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
new file mode 100644
index 0000000000..cdb23bff33
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDESCRIPTOR_H__
+#define __ARM_COMPUTE_CLHOGDESCRIPTOR_H__
+
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class IHOG;
+/** Basic function to calculate the HOG descriptor. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLHOGGradient
+ * -# @ref CLHOGOrientationBinningKernel
+ * -# @ref CLHOGBlockNormalizationKernel
+ *
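+ * A minimal usage sketch (assumes src, descriptor and hog are pre-initialised
+ * CLTensor and CLHOG objects; the HOG metadata drives the descriptor layout):
+ * @code
+ * CLHOGDescriptor hog_descriptor;
+ * hog_descriptor.configure(&src, &descriptor, &hog, BorderMode::CONSTANT, 0);
+ * hog_descriptor.run();
+ * @endcode
+ *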
+ */
+class CLHOGDescriptor : public IFunction
+{
+public:
+ /** Default constructor */
+ CLHOGDescriptor();
+ /** Initialise the function's source, destination, HOG data-object and border mode
+ *
+ * @param[in, out] input Input tensor. Data type supported: U8
+ * (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Output tensor which stores the HOG descriptor. Data type supported: F32. The number of channels is equal to the number of histogram bins per block.
+ * @param[in] hog HOG data object which describes the HOG descriptor
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited method overridden:
+ void run() override;
+
+private:
+ CLHOGGradient _gradient;
+ CLHOGOrientationBinningKernel _orient_bin;
+ CLHOGBlockNormalizationKernel _block_norm;
+ CLTensor _mag;
+ CLTensor _phase;
+ CLTensor _hog_space;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDESCRIPTOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
new file mode 100644
index 0000000000..0b4fad7766
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGDETECTOR_H__
+#define __ARM_COMPUTE_CLHOGDETECTOR_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute HOG detector based on linear SVM. This function calls the following OpenCL kernel:
+ *
+ * -# @ref CLHOGDetectorKernel
+ *
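+ * A minimal usage sketch (assumes descriptor, hog and windows are pre-initialised
+ * CLTensor, CLHOG and CLDetectionWindowArray objects; the stride is illustrative):
+ * @code
+ * CLHOGDetector detector;
+ * detector.configure(&descriptor, &hog, &windows, Size2D(8, 8));
+ * detector.run();
+ * @endcode
+ *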
+ */
+class CLHOGDetector : public IFunction
+{
+public:
+ /** Default constructor */
+ CLHOGDetector();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGDetector(const CLHOGDetector &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGDetector &operator=(const CLHOGDetector &) = delete;
+ /** Allow instances of this class to be moved */
+ CLHOGDetector(CLHOGDetector &&) = default;
+ /** Allow instances of this class to be moved */
+ CLHOGDetector &operator=(CLHOGDetector &&) = default;
+ /** Default destructor */
+ ~CLHOGDetector() = default;
+ /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
+ *
+     * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is the caller's responsibility to clear it.
+ *
+     * @param[in]  input                   Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
+ * @param[in] hog HOG data-object that describes the HOG descriptor
+ * @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects
+ * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+     *                                     It must be a multiple of the block stride stored in @p hog
+ * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
+ * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
+ */
+ void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLHOGDetectorKernel _hog_detector_kernel;
+ ICLDetectionWindowArray *_detection_windows;
+ cl::Buffer _num_detection_windows;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGDETECTOR_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
new file mode 100644
index 0000000000..e74a68497f
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGGRADIENT_H__
+#define __ARM_COMPUTE_CLHOGGRADIENT_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLDerivative.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic function to calculate the gradient for HOG. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLDerivative
+ * -# @ref CLMagnitudePhaseKernel
+ *
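+ * A minimal usage sketch (assumes src, mag and phase are pre-initialised
+ * CLTensor objects of the data types documented below):
+ * @code
+ * CLHOGGradient gradient;
+ * gradient.configure(&src, &mag, &phase, PhaseType::UNSIGNED, BorderMode::REPLICATE);
+ * gradient.run();
+ * @endcode
+ *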
+ */
+class CLHOGGradient : public IFunction
+{
+public:
+ /** Default constructor */
+ CLHOGGradient();
+ /** Initialise the function's source, destinations, phase type and border mode
+ *
+ * @param[in, out] input Input tensor. Data type supported: U8.
+ * (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_magnitude Output tensor (magnitude). Data type supported: U16.
+     * @param[out]    output_phase          Output tensor (phase). Data type supported: U8
+     * @param[in]     phase_type            Phase calculation type (@ref PhaseType)
+ * @param[in] border_mode Border mode to use
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited method overridden:
+ void run() override;
+
+private:
+ CLDerivative _derivative;
+ CLMagnitudePhaseKernel _mag_phase;
+ CLTensor _gx;
+ CLTensor _gy;
+};
+}
+#endif /*__ARM_COMPUTE_CLHOGGRADIENT_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
new file mode 100644
index 0000000000..3fe0fa932a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+#define __ARM_COMPUTE_CLHOGMULTIDETECTION_H__
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following kernels:
+ *
+ * -# @ref CLHOGGradient
+ * -# @ref CLHOGOrientationBinningKernel
+ * -# @ref CLHOGBlockNormalizationKernel
+ * -# @ref CLHOGDetector
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ *
+ * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
+ * -# Phase type
+ *       -# Normalization type
+ *       -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
+ *
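+ * A minimal usage sketch (assumes src, multi_hog, windows and strides are
+ * pre-initialised objects; strides holds one Size2D entry per HOG model):
+ * @code
+ * CLHOGMultiDetection multi_detection;
+ * multi_detection.configure(&src, &multi_hog, &windows, &strides, BorderMode::CONSTANT);
+ * multi_detection.run();
+ * @endcode
+ *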
+ */
+class CLHOGMultiDetection : public IFunction
+{
+public:
+ /** Default constructor */
+ CLHOGMultiDetection();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete;
+ /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
+ *
+ * @param[in, out] input Input tensor. Data type supported: U8
+ * (Written to only for @p border_mode != UNDEFINED)
+     * @param[in]      multi_hog                Container of multiple HOG data objects. Each HOG data object describes one HOG model to detect.
+     *                                          This container should store the HOG data-objects in descending or ascending cell_size width order.
+     *                                          This helps to determine whether the HOG descriptor computation can be skipped for some HOG data-objects
+ * @param[out] detection_windows Array of @ref DetectionWindow used for locating the detected objects
+ * @param[in] detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
+     *                                          The dimension of this array must be the same as multi_hog->num_models()
+     *                                          The i-th detection_window_stride of this array must be a multiple of the block_stride stored in the i-th multi_hog array
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
+ * @param[in] non_maxima_suppression (Optional) Flag to specify whether the non-maxima suppression is required or not.
+ * True if the non-maxima suppression stage has to be computed
+ * @param[in] min_distance (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
+ *
+ */
+ void configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value = 0,
+ float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
+
+ // Inherited method overridden:
+ void run() override;
+
+private:
+ CLHOGGradient _gradient_kernel;
+ std::unique_ptr<CLHOGOrientationBinningKernel[]> _orient_bin_kernel;
+ std::unique_ptr<CLHOGBlockNormalizationKernel[]> _block_norm_kernel;
+ std::unique_ptr<CLHOGDetector[]> _hog_detect_kernel;
+ std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+ std::unique_ptr<CLTensor[]> _hog_space;
+ std::unique_ptr<CLTensor[]> _hog_norm_space;
+ ICLDetectionWindowArray *_detection_windows;
+ CLTensor _mag;
+ CLTensor _phase;
+ bool _non_maxima_suppression;
+ size_t _num_orient_bin_kernel;
+ size_t _num_block_norm_kernel;
+ size_t _num_hog_detect_kernel;
+};
+}
+
+#endif /* __ARM_COMPUTE_CLHOGMULTIDETECTION_H__ */ \ No newline at end of file
diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
new file mode 100644
index 0000000000..90da687435
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHARRISCORNERS_H__
+#define __ARM_COMPUTE_CLHARRISCORNERS_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include <cstdint>
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute Harris corners detection. This function calls the following CL and CPP kernels and functions:
+ *
+ * @note Requires CPU support for the kernels: CPPCornerCandidatesKernel and CPPSortEuclideanDistanceKernel.
+ *
+ * -# @ref CLSobel3x3 (if gradient_size == 3) or<br/>
+ * @ref CLSobel5x5 (if gradient_size == 5) or<br/>
+ * @ref CLSobel7x7 (if gradient_size == 7)
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLHarrisScoreKernel
+ * -# @ref CLNonMaximaSuppression3x3
+ * -# @ref CPPCornerCandidatesKernel
+ * -# @ref CPPSortEuclideanDistanceKernel
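+ *
+ * A minimal usage sketch (assumes src is an initialised U8 CLImage and corners
+ * a CLKeyPointArray with sufficient capacity; the numeric values are placeholders):
+ * @code
+ * CLHarrisCorners harris;
+ * harris.configure(&src, 20000.0f, 5.0f, 0.04f, 3, 3, &corners, BorderMode::UNDEFINED);
+ * harris.run();
+ * @endcode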
+ */
+class CLHarrisCorners : public IFunction
+{
+public:
+ /** Constructor */
+ CLHarrisCorners();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHarrisCorners(const CLHarrisCorners &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ const CLHarrisCorners &operator=(const CLHarrisCorners &) = delete;
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in,out] input Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] threshold Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+ * @param[in] min_dist Radial Euclidean distance for the euclidean distance stage.
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] gradient_size The gradient window size to use on the input. The implementation supports 3, 5, and 7
+ * @param[in] block_size The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
+ * @param[out] corners Array of keypoints to store the results.
+ * @param[in] border_mode Border mode to use
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLImage *input, float threshold, float min_dist, float sensitivity,
+ int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<IFunction> _sobel; /**< Sobel function */
+ CLHarrisScoreKernel _harris_score; /**< Harris score kernel */
+    CLNonMaximaSuppression3x3Kernel _non_max_suppr;       /**< Non-maxima suppression kernel */
+    CPPCornerCandidatesKernel       _candidates;          /**< Corner candidates kernel */
+    CPPSortEuclideanDistanceKernel  _sort_euclidean;      /**< Euclidean distance sort kernel */
+ CLFillBorderKernel _border_gx; /**< Border handler before running harris score */
+ CLFillBorderKernel _border_gy; /**< Border handler before running harris score */
+ CLImage _gx; /**< Source image - Gx component */
+ CLImage _gy; /**< Source image - Gy component */
+ CLImage _score; /**< Source image - Harris score */
+ CLImage _nonmax; /**< Source image - Non-Maxima suppressed image */
+ std::unique_ptr<InternalKeypoint[]> _corners_list; /**< Array of InternalKeypoint. It stores the potential corner candidates */
+ int32_t _num_corner_candidates; /**< Number of potential corner candidates */
+ ICLKeyPointArray *_corners; /**< Output corners array */
+};
+}
+#endif /*__ARM_COMPUTE_CLHARRISCORNERS_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h
new file mode 100644
index 0000000000..455b61812d
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLHistogram.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLHISTOGRAM_H__
+#define __ARM_COMPUTE_CLHISTOGRAM_H__
+
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLDistribution1D;
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute histogram. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLHistogramKernel
+ * -# @ref CLHistogramBorderKernel
+ *
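+ * A minimal usage sketch (assumes src is an initialised U8 CLImage and dist
+ * a CLDistribution1D, e.g. one constructed with 256 bins over [0, 256)):
+ * @code
+ * CLHistogram hist;
+ * hist.configure(&src, &dist);
+ * hist.run();
+ * @endcode
+ *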
+ */
+class CLHistogram : public IFunction
+{
+public:
+    /** Default constructor */
+ CLHistogram();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLHistogram(const CLHistogram &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ const CLHistogram &operator=(const CLHistogram &) = delete;
+ /** Initialize the function
+ *
+ * @param[in] input Source image. Data types supported: U8
+ * @param[out] output Output distribution.
+ */
+ void configure(const ICLImage *input, ICLDistribution1D *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+    CLHistogramKernel       _kernel;        /**< Kernel to run */
+ CLHistogramBorderKernel _kernel_border; /**< Border kernel to run */
+};
+}
+#endif /*__ARM_COMPUTE_CLHISTOGRAM_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h
new file mode 100644
index 0000000000..25fc549b29
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLINTEGRALIMAGE_H__
+#define __ARM_COMPUTE_CLINTEGRALIMAGE_H__
+
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute integral image. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLIntegralImageHorKernel
+ * -# @ref CLIntegralImageVertKernel
+ *
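+ * A minimal usage sketch (assumes src and dst are pre-initialised U8 and U32
+ * CLTensor objects of the same shape):
+ * @code
+ * CLIntegralImage integral;
+ * integral.configure(&src, &dst);
+ * integral.run();
+ * @endcode
+ *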
+ */
+class CLIntegralImage : public IFunction
+{
+public:
+ /** Default Constructor. */
+ CLIntegralImage();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+     * @param[out] output Destination tensor. Data types supported: U32.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ CLIntegralImageHorKernel _integral_hor; /**< Integral Image Horizontal kernel */
+ CLIntegralImageVertKernel _integral_vert; /**< Integral Image Vertical kernel */
+};
+}
+#endif /*__ARM_COMPUTE_CLINTEGRALIMAGE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
new file mode 100644
index 0000000000..0c6708aa73
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLAPLACIANPYRAMID_H__
+#define __ARM_COMPUTE_CLLAPLACIANPYRAMID_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute Laplacian pyramid. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLGaussianPyramidHalf
+ * -# @ref CLGaussian5x5
+ * -# @ref CLArithmeticSubtraction
+ *
+ * First a Gaussian pyramid is created. Then, for each level i, the corresponding tensor I(i) is blurred with the Gaussian 5x5 filter, and the
+ * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid:
+ * L(i) = I(i) - Gaussian5x5(I(i))
+ * Level 0 always has the same first two dimensions as the input tensor.
+ */
+class CLLaplacianPyramid : public IFunction
+{
+public:
+ /** Constructor */
+ CLLaplacianPyramid();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data types supported: U8.
+     * @param[out] pyramid               Destination pyramid tensors. Data types supported at each level: S16.
+     * @param[out] output                The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data types supported: S16.
+     *                                   The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
+     *                                   output.width() = input.width() / pow(2, pyramid_levels - 1) and output.height() = input.height() / pow(2, pyramid_levels - 1)
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
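+     * A minimal usage sketch (assumes src, pyr and lowres are pre-initialised; pyr
+     * is an S16 CLPyramid and lowres has the dimensions described above):
+     * @code
+     * CLLaplacianPyramid laplacian;
+     * laplacian.configure(&src, &pyr, &lowres, BorderMode::REPLICATE, 0);
+     * laplacian.run();
+     * @endcode
+     *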
+ */
+ void configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ size_t _num_levels;
+ CLGaussianPyramidHalf _gaussian_pyr_function;
+ std::unique_ptr<CLGaussian5x5[]> _convf;
+ std::unique_ptr<CLArithmeticSubtraction[]> _subf;
+ CLDepthConvert _depth_function;
+ CLPyramid _gauss_pyr;
+ CLPyramid _conv_pyr;
+};
+}
+#endif /*__ARM_COMPUTE_CLLAPLACIANPYRAMID_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
new file mode 100644
index 0000000000..4bc7eb65ce
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__
+#define __ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLScale.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute Laplacian reconstruction. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLArithmeticAddition
+ * -# @ref CLScale
+ * -# @ref CLDepthConvert
+ *
+ * This function reconstructs the original image from a Laplacian Image Pyramid.
+ *
+ * The input image is added to the last level of the Laplacian pyramid L(n-1); the resulting image is upsampled to the
+ * resolution of the next pyramid level:
+ *
+ * I(n-2) = upsample(input + L(n-1))
+ *
+ * For each pyramid level i, except i=0 and i=n-1:
+ * I(i-1) = upsample(I(i) + L(i))
+ *
+ * output = I(0) + L(0)
+ */
+class CLLaplacianReconstruct : public IFunction
+{
+public:
+ /** Constructor */
+ CLLaplacianReconstruct();
+ /** Initialise the function's source, destinations and border mode.
+ *
+     * The output image must have the same size as the first level of the pyramid.
+     * The input image must have the same size as the last level of the pyramid.
+     *
+     * The idea is to reconstruct the original high-resolution image from a low-resolution representation of it and the Laplacian pyramid.
+ *
+     * @param[in]  pyramid               Laplacian pyramid tensors. Data types supported at each level: S16.
+ * @param[in] input Source tensor. Data types supported: S16.
+ * @param[out] output Output tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
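+     * A minimal usage sketch (assumes pyr, lowres and dst are the pyramid and
+     * tensors produced and expected by a matching @ref CLLaplacianPyramid setup):
+     * @code
+     * CLLaplacianReconstruct reconstruct;
+     * reconstruct.configure(&pyr, &lowres, &dst, BorderMode::REPLICATE, 0);
+     * reconstruct.run();
+     * @endcode
+     *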
+ */
+ void configure(const CLPyramid *pyramid, const ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLPyramid _tmp_pyr;
+ std::unique_ptr<CLArithmeticAddition[]> _addf;
+ std::unique_ptr<CLScale[]> _scalef;
+ CLDepthConvert _depthf;
+};
+}
+#endif /*__ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
new file mode 100644
index 0000000000..b4e469196e
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLLocallyConnectedLayerWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref CLIm2ColKernel
+ * -# @ref CLLocallyConnectedMatrixMultiplyKernel
+ * -# @ref CLCol2ImKernel
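+ *
+ * A minimal usage sketch (assumes src, weights, biases and dst are
+ * pre-initialised F32 CLTensor objects with the shapes documented below):
+ * @code
+ * CLLocallyConnectedLayer lc_layer;
+ * lc_layer.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));
+ * lc_layer.run();
+ * @endcode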
+ */
+class CLLocallyConnectedLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ CLLocallyConnectedLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+     * @param[in]  weights   Weights tensor. Weights are a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+     * @param[in]  biases    Biases tensor. Shared biases supported. Biases are a 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLIm2ColKernel _input_im2col_kernel;
+ CLLocallyConnectedLayerWeightsReshapeKernel _weights_reshape_kernel;
+ CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
+ CLCol2ImKernel _output_col2im_kernel;
+ CLTensor _input_im2col_reshaped;
+ CLTensor _weights_reshaped;
+ CLTensor _gemm_output;
+ bool _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h
new file mode 100644
index 0000000000..dc5f9139b3
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMagnitude.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMAGNITUDE_H__
+#define __ARM_COMPUTE_CLMAGNITUDE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLMagnitudePhaseKernel. */
+class CLMagnitude : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs.
+ *
+ * @param[in] input1 First tensor input. Data types supported: S16.
+ * @param[in] input2 Second tensor input. Data types supported: S16.
+ * @param[out] output Output tensor. Data types supported: S16.
+ * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
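+     *
+     * A minimal usage sketch (assumes gx, gy and mag are pre-initialised S16
+     * CLTensor objects of the same shape):
+     * @code
+     * CLMagnitude magnitude;
+     * magnitude.configure(&gx, &gy, &mag);
+     * magnitude.run();
+     * @endcode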
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
+};
+}
+#endif /*__ARM_COMPUTE_CLMAGNITUDE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
new file mode 100644
index 0000000000..e33bcdd779
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMEANSTDDEV_H__
+#define __ARM_COMPUTE_CLMEANSTDDEV_H__
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute mean and standard deviation by calling @ref CLMeanStdDevKernel */
+class CLMeanStdDev : public IFunction
+{
+public:
+ /** Default Constructor. */
+ CLMeanStdDev();
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @param[in] input Input image. Data types supported: U8.
+ * @param[out] mean Output average pixel value.
+     * @param[out] stddev (Optional) Output standard deviation of pixel values.
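+     *
+     * A minimal usage sketch (assumes src is an initialised U8 CLImage):
+     * @code
+     * CLMeanStdDev mean_stddev;
+     * float mean = 0.0f, stddev = 0.0f;
+     * mean_stddev.configure(&src, &mean, &stddev);
+     * mean_stddev.run();
+     * @endcode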
+ */
+ void configure(const ICLImage *input, float *mean, float *stddev = nullptr);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+    CLMeanStdDevKernel _mean_stddev_kernel;    /**< Kernel that computes the mean and standard deviation. */
+ cl::Buffer _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */
+ cl::Buffer _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
+};
+}
+#endif /*__ARM_COMPUTE_CLMEANSTDDEV_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h
new file mode 100644
index 0000000000..af84ba7289
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMedian3x3.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMEDIAN3X3_H__
+#define __ARM_COMPUTE_CLMEDIAN3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute a median filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLMedian3x3Kernel
+ *
+ */
+class CLMedian3x3 : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLMEDIAN3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
new file mode 100644
index 0000000000..84fd67515b
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLMINMAXLOCATION_H__
+#define __ARM_COMPUTE_CLMINMAXLOCATION_H__
+
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+using ICLImage = ICLTensor;
+
+/** Basic function to execute min and max location. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLMinMaxKernel
+ * -# @ref CLMinMaxLocationKernel
+ */
+class CLMinMaxLocation : public IFunction
+{
+public:
+ /** Constructor */
+ CLMinMaxLocation();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMinMaxLocation(const CLMinMaxLocation &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMinMaxLocation &operator=(const CLMinMaxLocation &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMinMaxLocation(CLMinMaxLocation &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMinMaxLocation &operator=(CLMinMaxLocation &&) = default;
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
+ *
+ * @param[in] input Input image. Data types supported: U8 or S16.
+ * @param[out] min Minimum value of image.
+ * @param[out] max Maximum value of image.
+ * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations.
+ * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations.
+ * @param[out] min_count (Optional) Number of minimum value encounters.
+ * @param[out] max_count (Optional) Number of maximum value encounters.
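+ *
+ * A minimal usage sketch (illustrative only; assumes img is a pre-configured U8 CLImage and that CLCoordinates2DArray is constructed with its maximum capacity):
+ * @code
+ * CLMinMaxLocation min_max_loc;
+ * int32_t min_val = 0, max_val = 0;
+ * CLCoordinates2DArray min_coords(100), max_coords(100); // capacity caps the number of reported locations
+ * min_max_loc.configure(&img, &min_val, &max_val, &min_coords, &max_coords);
+ * min_max_loc.run();
+ * @endcode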
+ */
+ void configure(const ICLImage *input, int32_t *min, int32_t *max,
+ CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr,
+ uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLMinMaxKernel _min_max_kernel; /**< Kernel that performs min/max */
+ CLMinMaxLocationKernel _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */
+ cl::Buffer _min_max_vals; /**< Buffer to collect min, max values */
+ cl::Buffer _min_max_count_vals; /**< Buffer to collect the number of min, max occurrences */
+ int32_t *_min; /**< Minimum value. */
+ int32_t *_max; /**< Maximum value. */
+ uint32_t *_min_count; /**< Minimum value occurrences. */
+ uint32_t *_max_count; /**< Maximum value occurrences. */
+ CLCoordinates2DArray *_min_loc; /**< Minimum value occurrences coordinates. */
+ CLCoordinates2DArray *_max_loc; /**< Maximum value occurrences coordinates. */
+};
+}
+#endif /*__ARM_COMPUTE_CLMINMAXLOCATION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
new file mode 100644
index 0000000000..9eee33e0ba
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNONLINEARFILTER_H__
+#define __ARM_COMPUTE_CLNONLINEARFILTER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute a non-linear filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLNonLinearFilterKernel
+ *
+ * @note Supported mask dimensions: squares of sizes 3 and 5
+ */
+class CLNonLinearFilter : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, filter parameters and border mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8
+ * @param[in] function Non-linear function to perform
+ * @param[in] mask_size Mask size. Supported sizes: 3, 5
+ * @param[in] pattern Mask pattern
+ * @param[in] mask The given mask. Used only if @p pattern is set to PATTERN_OTHER
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
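+ *
+ * A minimal usage sketch (illustrative only; assumes src and dst are pre-configured U8 CLTensor objects): a 5x5 box median filter, so the mask argument is not needed:
+ * @code
+ * CLNonLinearFilter filter;
+ * filter.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 5, MatrixPattern::BOX, nullptr, BorderMode::REPLICATE);
+ * filter.run();
+ * @endcode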
+ */
+ void configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLNONLINEARFILTER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
new file mode 100644
index 0000000000..7adced4313
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H__
+#define __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following CL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLNonMaximaSuppression3x3Kernel
+ */
+class CLNonMaximaSuppression3x3 : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT.
+ * The constant value used with CONSTANT border mode is 0.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination for the Non-Maxima suppressions 3x3. Data types supported: same as @p input.
+ * @param[in] border_mode Border mode to use for non-maxima suppression.
+ * The implementation supports just 2 border modes: UNDEFINED and CONSTANT
+ */
+ void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode);
+};
+}
+#endif /* __ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
new file mode 100644
index 0000000000..a4dae85c1d
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_CLNORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to simulate a normalization layer. This function calls the following CL kernels:
+ *
+ * -# @ref CLPixelWiseMultiplicationKernel
+ * -# @ref CLFillBorderKernel
+ * -# @ref CLNormalizationLayerKernel
+ *
+ */
+class CLNormalizationLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ CLNormalizationLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[out] output Destination tensor. Dimensions, data type and number of channels must match the input ones.
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
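+ *
+ * A minimal usage sketch (illustrative only; assumes src and dst are pre-configured F32 CLTensor objects):
+ * @code
+ * CLNormalizationLayer norm;
+ * norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5)); // cross-map normalization over 5 elements
+ * norm.run();
+ * @endcode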
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLTensor _squared_input; /**< The intermediate buffer which stores the results of squaring the input */
+ CLNormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel to run */
+ CLPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel to run */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle borders */
+};
+}
+#endif /* __ARM_COMPUTE_CLNORMALIZATIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
new file mode 100644
index 0000000000..ca3f86100e
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLOPTICALFLOW_H__
+#define __ARM_COMPUTE_CLOPTICALFLOW_H__
+
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class CLPyramid;
+
+using CLLKInternalKeypointArray = CLArray<CLLKInternalKeypoint>;
+using CLCoefficientTableArray = CLArray<CLCoefficientTable>;
+using CLOldValueArray = CLArray<CLOldValue>;
+
+/** Basic function to execute optical flow. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLScharr3x3
+ * -# @ref CLLKTrackerInitKernel
+ * -# @ref CLLKTrackerStage0Kernel
+ * -# @ref CLLKTrackerStage1Kernel
+ * -# @ref CLLKTrackerFinalizeKernel
+ */
+class CLOpticalFlow : public IFunction
+{
+public:
+ /** Default constructor */
+ CLOpticalFlow();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOpticalFlow(const CLOpticalFlow &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLOpticalFlow &operator=(const CLOpticalFlow &) = delete;
+ /** Allow instances of this class to be moved */
+ CLOpticalFlow(CLOpticalFlow &&) = default;
+ /** Allow instances of this class to be moved */
+ CLOpticalFlow &operator=(CLOpticalFlow &&) = default;
+ /** Initialise the function's input and output
+ *
+ * @param[in] old_pyramid Pointer to the pyramid for the old tensor. Data types supported: U8
+ * @param[in] new_pyramid Pointer to the pyramid for the new tensor. Data types supported: U8
+ * @param[in] old_points Pointer to the IKeyPointArray storing old key points
+ * @param[in] new_points_estimates Pointer to the IKeyPointArray storing the new estimated key points
+ * @param[out] new_points Pointer to the IKeyPointArray storing new key points
+ * @param[in] termination The criteria to terminate the search of each keypoint.
+ * @param[in] epsilon The error for terminating the algorithm
+ * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
+ * @param[in] border_mode The border mode applied at the Scharr kernel stage
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
+ *
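+ * A minimal usage sketch (illustrative only; assumes the pyramids and keypoint arrays have been created and filled elsewhere; the literals are epsilon, iteration count and window size):
+ * @code
+ * CLOpticalFlow optical_flow;
+ * optical_flow.configure(&old_pyramid, &new_pyramid, &old_points, &new_points_estimates, &new_points,
+ *                        Termination::TERM_CRITERIA_BOTH, 0.01f, 10, 21, true, BorderMode::UNDEFINED);
+ * optical_flow.run();
+ * @endcode
+ *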
+ */
+ void configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
+ const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
+ BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<CLLKTrackerInitKernel[]> _tracker_init_kernel;
+ std::unique_ptr<CLLKTrackerStage0Kernel[]> _tracker_stage0_kernel;
+ std::unique_ptr<CLLKTrackerStage1Kernel[]> _tracker_stage1_kernel;
+ CLLKTrackerFinalizeKernel _tracker_finalize_kernel;
+ std::unique_ptr<CLScharr3x3[]> _func_scharr;
+ std::unique_ptr<CLTensor[]> _scharr_gx;
+ std::unique_ptr<CLTensor[]> _scharr_gy;
+ const ICLKeyPointArray *_old_points;
+ const ICLKeyPointArray *_new_points_estimates;
+ ICLKeyPointArray *_new_points;
+ std::unique_ptr<CLLKInternalKeypointArray> _old_points_internal;
+ std::unique_ptr<CLLKInternalKeypointArray> _new_points_internal;
+ std::unique_ptr<CLCoefficientTableArray> _coefficient_table;
+ std::unique_ptr<CLOldValueArray> _old_values;
+ size_t _num_levels;
+};
+}
+#endif /*__ARM_COMPUTE_CLOPTICALFLOW_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h
new file mode 100644
index 0000000000..7cdfab16e2
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLPhase.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPHASE_H__
+#define __ARM_COMPUTE_CLPHASE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLMagnitudePhaseKernel. */
+class CLPhase : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input1 First tensor input. Data types supported: S16.
+ * @param[in] input2 Second tensor input. Data types supported: S16.
+ * @param[out] output Output tensor. Data types supported: U8.
+ * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED);
+};
+}
+#endif /*__ARM_COMPUTE_CLPHASE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
new file mode 100644
index 0000000000..71754fc3f4
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H__
+#define __ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLPixelWiseMultiplicationKernel. */
+class CLPixelWiseMultiplication : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8, S16, F16 or F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16 or F32.
+ * @param[out] output Output tensor. Data types supported: U8 (only if both inputs are U8), S16, F16 or F32.
+ * @param[in] scale Scale to apply after multiplication. Must be positive.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
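+ *
+ * A minimal usage sketch (illustrative only; assumes a, b and out are pre-configured CLTensor objects of matching shape):
+ * @code
+ * CLPixelWiseMultiplication mul;
+ * mul.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ * mul.run();
+ * @endcode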
+ */
+ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+};
+}
+#endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
new file mode 100644
index 0000000000..f92860e5b2
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLPOOLINGLAYER_H__
+#define __ARM_COMPUTE_CLPOOLINGLAYER_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref CLPoolingLayerKernel
+ */
+class CLPoolingLayer : public ICLSimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: F16, F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
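+ *
+ * A minimal usage sketch (illustrative only; assumes src and dst are pre-configured F32 CLTensor objects): 2x2 max pooling with stride 2 and no padding:
+ * @code
+ * CLPoolingLayer pool;
+ * pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
+ * pool.run();
+ * @endcode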
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info);
+};
+}
+#endif /* __ARM_COMPUTE_CLPOOLINGLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h
new file mode 100644
index 0000000000..4cb2be90e7
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLRemap.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLREMAP_H__
+#define __ARM_COMPUTE_CLREMAP_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute remap. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLRemapKernel
+ */
+class CLRemap : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's sources, destination, interpolation policy and border mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] map_x Map for X coords. Data types supported: F32.
+ * @param[in] map_y Map for Y coords. Data types supported: F32.
+ * @param[out] output Output tensor. Data types supported: U8.
+ * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
+ * @param[in] border_mode Border mode to use on the input tensor.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
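+ * A minimal usage sketch (illustrative only; assumes src and dst are U8 tensors and map_x, map_y are F32 tensors of the output shape):
+ * @code
+ * CLRemap remap;
+ * remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT, 0);
+ * remap.run();
+ * @endcode
+ *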
+ */
+ void configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
+ InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLREMAP_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
new file mode 100644
index 0000000000..c2438ddf9b
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCALE_H__
+#define __ARM_COMPUTE_CLSCALE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLScaleKernel */
+class CLScale : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, interpolation type and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8, S16. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] policy The interpolation type.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
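+ *
+ * A minimal usage sketch (illustrative only; assumes src and dst are pre-configured U8 CLTensor objects whose XY dimensions define the scaling factor):
+ * @code
+ * CLScale scale;
+ * scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);
+ * scale.run();
+ * @endcode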
+ */
+ void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLSCALE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h
new file mode 100644
index 0000000000..3ea0b84624
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLScharr3x3.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSCHARR3X3_H__
+#define __ARM_COMPUTE_CLSCHARR3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute the Scharr 3x3 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLScharr3x3Kernel
+ *
+ */
+class CLScharr3x3 : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must not be NULL.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (optional) Destination for the Scharr 3x3 convolution along the X axis. Data types supported: S16.
+ * @param[out] output_y (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data types supported: S16.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLSCHARR3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h
new file mode 100644
index 0000000000..7a4f47d0ed
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSobel3x3.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL3X3_H__
+#define __ARM_COMPUTE_CLSOBEL3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute the Sobel 3x3 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLSobel3x3Kernel
+ *
+ */
+class CLSobel3x3 : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must not be NULL.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (optional) Destination for the Sobel 3x3 convolution along the X axis. Data types supported: S16.
+ * @param[out] output_y (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data types supported: S16.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
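+ *
+ * A minimal usage sketch (illustrative only; assumes src is a U8 tensor and gx, gy are S16 tensors): computing both gradients:
+ * @code
+ * CLSobel3x3 sobel;
+ * sobel.configure(&src, &gx, &gy, BorderMode::UNDEFINED);
+ * sobel.run();
+ * @endcode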
+ */
+ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL3X3_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h
new file mode 100644
index 0000000000..ad1f72faf8
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSobel5x5.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL5X5_H__
+#define __ARM_COMPUTE_CLSOBEL5X5_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute the Sobel 5x5 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLSobel5x5HorKernel
+ * -# @ref CLSobel5x5VertKernel
+ *
+ */
+class CLSobel5x5 : public IFunction
+{
+public:
+ /** Default Constructor. */
+ CLSobel5x5();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must not be NULL.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (optional) Destination for the Sobel 5x5 convolution along the X axis. Data types supported: S16.
+ * @param[out] output_y (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data types supported: S16.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ CLSobel5x5HorKernel _sobel_hor; /**< Sobel Horizontal 5x5 kernel */
+ CLSobel5x5VertKernel _sobel_vert; /**< Sobel Vertical 5x5 kernel */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+ CLImage _tmp_x; /**< Temporary buffer for Sobel X */
+ CLImage _tmp_y; /**< Temporary buffer for Sobel Y */
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL5X5_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h
new file mode 100644
index 0000000000..1a3fe1a50a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSobel7x7.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOBEL7X7_H__
+#define __ARM_COMPUTE_CLSOBEL7X7_H__
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute the Sobel 7x7 filter. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref CLSobel7x7HorKernel
+ * -# @ref CLSobel7x7VertKernel
+ *
+ */
+class CLSobel7x7 : public IFunction
+{
+public:
+ /** Default Constructor. */
+ CLSobel7x7();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must not be NULL.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (optional) Destination for the Sobel 7x7 convolution along the X axis. Data types supported: S32.
+ * @param[out] output_y (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data types supported: S32.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ CLSobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */
+ CLSobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */
+ CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+ CLImage _tmp_x; /**< Temporary buffer for Sobel X */
+ CLImage _tmp_y; /**< Temporary buffer for Sobel Y */
+};
+}
+#endif /*__ARM_COMPUTE_CLSOBEL7X7_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
new file mode 100644
index 0000000000..42cfc06fc4
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLSOFTMAXLAYER_H__
+#define __ARM_COMPUTE_CLSOFTMAXLAYER_H__
+
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to compute a SoftmaxLayer.
+ *
+ * Softmax is calculated by :
+ * @f[ out = exp(x - max(x)) / sum(exp(x - max(x))) @f]
+ *
+ * This function runs the following kernels:
+ * -# @ref CLLogits1DMaxKernel
+ * -# @ref CLLogits1DShiftExpSumKernel
+ * -# @ref CLLogits1DNormKernel
+ */
+class CLSoftmaxLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLSoftmaxLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: F16, F32. Number of channels must be 1.
+ * @param[out] output Destination tensor. Data type and number of channels must match the input.
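+ *
+ * A minimal usage sketch (illustrative only; assumes logits and probs are pre-configured F32 CLTensor objects of the same shape):
+ * @code
+ * CLSoftmaxLayer softmax;
+ * softmax.configure(&logits, &probs);
+ * softmax.run();
+ * @endcode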
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLLogits1DMaxKernel _max_kernel;
+ CLLogits1DShiftExpSumKernel _shift_exp_sum_kernel;
+ CLLogits1DNormKernel _norm_kernel;
+ CLTensor _max;
+ CLTensor _sum;
+ CLTensor _tmp;
+};
+}
+#endif /* __ARM_COMPUTE_CLSOFTMAXLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h
new file mode 100644
index 0000000000..ebe6593b6a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLTableLookup.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTABLELOOKUP_H__
+#define __ARM_COMPUTE_CLTABLELOOKUP_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+class ICLLut;
+
+/** Basic function to run @ref CLTableLookupKernel */
+class CLTableLookup : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input First tensor input. Data types supported: U8 and S16
+ * @param[in] lut Input lookup table. Data types supported: U8 and S16
+ * @param[out] output Output tensor. Data types supported: U8 and S16
+ */
+ void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_CLTABLELOOKUP_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h
new file mode 100644
index 0000000000..14c05786c1
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLThreshold.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTHRESHOLD_H__
+#define __ARM_COMPUTE_CLTHRESHOLD_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLThresholdKernel */
+class CLThreshold : public ICLSimpleFunction
+{
+public:
+ /** Initialise the function's source, destination, thresholds and threshold type
+ *
+ * @param[in] input First tensor input. Data types supported: U8.
+ * @param[out] output Output tensor. Data types supported: U8.
+ * @param[in] threshold Threshold. If upper threshold is specified, this will be used as the lower threshold.
+ * @param[in] false_value Value to assign when the condition is false.
+ * @param[in] true_value Value to assign when the condition is true.
+ * @param[in] type Thresholding type. Can either be BINARY or RANGE.
+ * @param[in] upper Upper threshold. Only used with RANGE thresholding.
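+ *
+ * A minimal usage sketch (illustrative only; assumes src and dst are pre-configured U8 CLTensor objects): binary thresholding at 127:
+ * @code
+ * CLThreshold threshold;
+ * threshold.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY);
+ * threshold.run();
+ * @endcode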
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+ uint8_t false_value = 0, uint8_t true_value = 0,
+ ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLTHRESHOLD_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h
new file mode 100644
index 0000000000..9b57fe00a8
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLTranspose.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLTRANSPOSE_H__
+#define __ARM_COMPUTE_CLTRANSPOSE_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to transpose a matrix on OpenCL. This function calls the following OpenCL kernel:
+ *
+ * -# @ref CLTransposeKernel
+ *
+ */
+class CLTranspose : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_CLTRANSPOSE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h
new file mode 100644
index 0000000000..aeab3f7b22
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLWarpAffine.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWARPAFFINE_H__
+#define __ARM_COMPUTE_CLWARPAFFINE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLWarpAffineKernel for AFFINE transformation */
+class CLWarpAffine : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, interpolation policy and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] matrix The affine matrix. Must be 2x3 of type float.
+ * @param[in] policy The interpolation type.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
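+/* Usage sketch (illustrative only; "src" and "dst" are hypothetical U8
+ * images, and the element ordering of the 2x3 matrix is assumed to follow
+ * the kernel's convention):
+ *
+ *   const float matrix[6] = { 1.f, 0.f, 0.f, 1.f, 10.f, 20.f };
+ *   CLWarpAffine warp;
+ *   warp.configure(&src, &dst, matrix, InterpolationPolicy::NEAREST_NEIGHBOR,
+ *                  BorderMode::CONSTANT, 0);
+ *   warp.run();
+ */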
+}
+#endif /*__ARM_COMPUTE_CLWARPAFFINE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
new file mode 100644
index 0000000000..80237017aa
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWARPPERSPECTIVE_H__
+#define __ARM_COMPUTE_CLWARPPERSPECTIVE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLWarpPerspectiveKernel for PERSPECTIVE transformation */
+class CLWarpPerspective : public ICLSimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, interpolation policy and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8.
+ * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
+ * @param[in] policy The interpolation type.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_CLWARPPERSPECTIVE_H__ */
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
new file mode 100644
index 0000000000..7a37e5ec21
--- /dev/null
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPSCHEDULER_H__
+#define __ARM_COMPUTE_CPPSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class Thread;
+
+/** C++11 implementation of a pool of threads to automatically split a kernel's execution among several threads. */
+class CPPScheduler : public IScheduler
+{
+public:
+ /** Sets the number of threads the scheduler will use to run the kernels.
+ *
+ * @param[in] num_threads If set to 0, the maximum number of threads supported by C++11 (std::thread::hardware_concurrency()) will be used; otherwise, the specified number of threads.
+ */
+ void set_num_threads(unsigned int num_threads) override;
+ /** Returns the number of threads that the CPPScheduler has in its pool.
+ *
+ * @return Number of threads available in CPPScheduler.
+ */
+ unsigned int num_threads() const override;
+ /** Access the scheduler singleton
+ *
+ * @return The scheduler
+ */
+ static CPPScheduler &get();
+ /** Multithread the execution of the passed kernel if possible.
+ *
+ * The kernel will run on a single thread if any of these conditions is true:
+ * - ICPPKernel::is_parallelisable() returns false
+ * - The scheduler has been initialized with only one thread.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+ */
+ void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+ /** Constructor: create a pool of threads. */
+ CPPScheduler();
+
+ unsigned int _num_threads;
+ std::unique_ptr<Thread[], void (*)(Thread *)> _threads;
+};
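+/* Usage sketch (illustrative only): functions normally invoke schedule()
+ * internally, so most users only need to tune the thread count.
+ *
+ *   CPPScheduler::get().set_num_threads(4); // 0 selects the C++11 maximum
+ */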
+}
+#endif /* __ARM_COMPUTE_CPPSCHEDULER_H__ */
diff --git a/arm_compute/runtime/Distribution1D.h b/arm_compute/runtime/Distribution1D.h
new file mode 100644
index 0000000000..7080e88075
--- /dev/null
+++ b/arm_compute/runtime/Distribution1D.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_DISTRIBUTION1D_H__
+#define __ARM_COMPUTE_DISTRIBUTION1D_H__
+
+#include "arm_compute/core/IDistribution1D.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of the 1D distribution interface */
+class Distribution1D : public IDistribution1D
+{
+public:
+ /** Constructor: Creates a 1D Distribution of a consecutive interval [offset, offset + range - 1]
+ * defined by a start offset and valid range, divided equally into num_bins parts.
+ *
+ * @param[in] num_bins The number of bins the distribution is divided in.
+ * @param[in] offset The start of the values to use.
+ * @param[in] range The total number of consecutive values in the distribution interval.
+ */
+ Distribution1D(size_t num_bins, int32_t offset, uint32_t range);
+
+ // Inherited methods overridden:
+ uint32_t *buffer() const override;
+
+private:
+ std::unique_ptr<uint32_t[]> _data; /**< The distribution data. */
+};
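+/* Usage sketch (illustrative only): a 16-bin distribution covering the full
+ * U8 value range, i.e. offset 0 and range 256:
+ *
+ *   Distribution1D dist(16, 0, 256);
+ */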
+}
+#endif /* __ARM_COMPUTE_DISTRIBUTION1D_H__ */
diff --git a/arm_compute/runtime/HOG.h b/arm_compute/runtime/HOG.h
new file mode 100644
index 0000000000..70d8034bef
--- /dev/null
+++ b/arm_compute/runtime/HOG.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_HOG_H__
+#define __ARM_COMPUTE_HOG_H__
+
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** CPU implementation of HOG data-object */
+class HOG : public IHOG
+{
+public:
+ /** Default constructor */
+ HOG();
+ /** Allocate the HOG descriptor using the given HOG's metadata
+ *
+ * @param[in] input HOG's metadata used to allocate the HOG descriptor
+ */
+ void init(const HOGInfo &input);
+
+ // Inherited method overridden:
+ const HOGInfo *info() const override;
+ float *descriptor() const override;
+
+private:
+ HOGInfo _info;
+ std::unique_ptr<float[]> _descriptor;
+};
+}
+#endif /* __ARM_COMPUTE_HOG_H__ */
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
new file mode 100644
index 0000000000..a4e7ed15e0
--- /dev/null
+++ b/arm_compute/runtime/IFunction.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IFUNCTION_H__
+#define __ARM_COMPUTE_IFUNCTION_H__
+
+namespace arm_compute
+{
+/** Base class for all functions */
+class IFunction
+{
+public:
+ /** Run the kernels contained in the function
+ *
+ * For NEON kernels:
+ * - Multi-threading is used for the kernels which are parallelisable.
+ * - By default std::thread::hardware_concurrency() threads are used.
+ *
+ * @note @ref CPPScheduler::set_num_threads() can be used to manually set the number of threads
+ *
+ * For OpenCL kernels:
+ * - All the kernels are enqueued on the queue associated with CLScheduler.
+ * - The queue is then flushed.
+ *
+ * @note The function will not block until the kernels are executed. It is the user's responsibility to wait.
+ */
+ virtual void run() = 0;
+ /** Destructor */
+ virtual ~IFunction() = default;
+};
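+/* Typical call pattern (illustrative only; "func", "src" and "dst" are
+ * hypothetical):
+ *
+ *   func.configure(&src, &dst); // done once, prepares the kernels
+ *   func.run();                 // may be called repeatedly
+ */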
+}
+#endif /*__ARM_COMPUTE_IFUNCTION_H__ */
diff --git a/arm_compute/runtime/ILutAllocator.h b/arm_compute/runtime/ILutAllocator.h
new file mode 100644
index 0000000000..f23fbd2154
--- /dev/null
+++ b/arm_compute/runtime/ILutAllocator.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ILUTALLOCATOR_H__
+#define __ARM_COMPUTE_ILUTALLOCATOR_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic interface to allocate LUTs */
+class ILutAllocator
+{
+public:
+ /** Default constructor */
+ ILutAllocator();
+ /** Default virtual destructor */
+ virtual ~ILutAllocator() = default;
+ /** Allow instances of this class to be move constructed */
+ ILutAllocator(ILutAllocator &&) = default;
+ /** Allow instances of this class to be moved */
+ ILutAllocator &operator=(ILutAllocator &&) = default;
+ /** Allocate an LUT of the requested number of elements and data_type.
+ *
+ * @param[in] num_elements Number of elements of the LUT.
+ * @param[in] data_type Data type of each element.
+ */
+ void init(size_t num_elements, DataType data_type);
+ /** Returns the total number of elements in the LUT.
+ *
+ * @return Total number of elements.
+ */
+ size_t num_elements() const;
+ /** Returns the type of the LUT.
+ *
+ * @return The type of the LUT.
+ */
+ DataType type() const;
+ /** Returns the total size in bytes of the LUT.
+ *
+ * @return Total size of the LUT in bytes.
+ */
+ size_t size() const;
+
+protected:
+ /** Interface to be implemented by the child class to allocate the LUT. */
+ virtual void allocate() = 0;
+ /** Interface to be implemented by the child class to lock the memory allocation for the CPU to access.
+ *
+ * @return Pointer to a CPU mapping of the memory
+ */
+ virtual uint8_t *lock() = 0;
+ /** Interface to be implemented by the child class to unlock the memory allocation after the CPU is done accessing it. */
+ virtual void unlock() = 0;
+
+private:
+ size_t _num_elements; /**< Number of elements allocated */
+ DataType _data_type; /**< Data type of LUT elements. */
+};
+}
+#endif /* __ARM_COMPUTE_ILUTALLOCATOR_H__ */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
new file mode 100644
index 0000000000..39c027c6b7
--- /dev/null
+++ b/arm_compute/runtime/IScheduler.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ISCHEDULER_H__
+#define __ARM_COMPUTE_ISCHEDULER_H__
+
+namespace arm_compute
+{
+class ICPPKernel;
+
+/** Scheduler interface to run kernels */
+class IScheduler
+{
+public:
+ /** Destructor. */
+ virtual ~IScheduler() = default;
+ /** Sets the number of threads the scheduler will use to run the kernels.
+ *
+ * @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
+ */
+ virtual void set_num_threads(unsigned int num_threads) = 0;
+ /** Returns the number of threads that the scheduler has in its pool.
+ *
+ * @return Number of threads available in the scheduler.
+ */
+ virtual unsigned int num_threads() const = 0;
+ /** Runs the kernel synchronously; depending on the implementation, its execution may be split among several threads.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+ */
+ virtual void schedule(ICPPKernel *kernel, unsigned int split_dimension) = 0;
+};
+}
+#endif /* __ARM_COMPUTE_ISCHEDULER_H__ */
diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h
new file mode 100644
index 0000000000..6103e436bc
--- /dev/null
+++ b/arm_compute/runtime/ITensorAllocator.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ITENSORALLOCATOR_H__
+#define __ARM_COMPUTE_ITENSORALLOCATOR_H__
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Interface to allocate tensors */
+class ITensorAllocator
+{
+public:
+ /** Default constructor. */
+ ITensorAllocator();
+ /** Allow instances of this class to be copy constructed */
+ ITensorAllocator(const ITensorAllocator &) = default;
+ /** Allow instances of this class to be copied */
+ ITensorAllocator &operator=(const ITensorAllocator &) = default;
+ /** Allow instances of this class to be move constructed */
+ ITensorAllocator(ITensorAllocator &&) = default;
+ /** Allow instances of this class to be moved */
+ ITensorAllocator &operator=(ITensorAllocator &&) = default;
+ /** Default virtual destructor. */
+ virtual ~ITensorAllocator() = default;
+
+ /** Initialize a tensor based on the passed @ref TensorInfo.
+ *
+ * @param[in] input TensorInfo object containing the description of the tensor to initialize.
+ */
+ void init(const TensorInfo &input);
+ /** Return a reference to the tensor's metadata
+ *
+ * @return Reference to the tensor's metadata.
+ */
+ TensorInfo &info();
+ /** Return a constant reference to the tensor's metadata
+ *
+ * @return Constant reference to the tensor's metadata.
+ */
+ const TensorInfo &info() const;
+
+ /** Interface to be implemented by the child class to allocate the tensor.
+ *
+ * @note The child is expected to use the TensorInfo to get the size of the memory allocation.
+ * @warning The tensor must not already be allocated. Otherwise calling the function will fail.
+ */
+ virtual void allocate() = 0;
+
+ /** Interface to be implemented by the child class to free the allocated tensor.
+ *
+ * @warning The tensor must have been allocated previously. Otherwise calling the function will fail.
+ */
+ virtual void free() = 0;
+
+protected:
+ /** Interface to be implemented by the child class to lock the memory allocation for the CPU to access.
+ *
+ * @return Pointer to a CPU mapping of the memory
+ */
+ virtual uint8_t *lock() = 0;
+ /** Interface to be implemented by the child class to unlock the memory allocation after the CPU is done accessing it. */
+ virtual void unlock() = 0;
+
+private:
+ TensorInfo _info; /**< Tensor's metadata. */
+};
+}
+#endif /*__ARM_COMPUTE_ITENSORALLOCATOR_H__ */
diff --git a/arm_compute/runtime/Lut.h b/arm_compute/runtime/Lut.h
new file mode 100644
index 0000000000..87431feee4
--- /dev/null
+++ b/arm_compute/runtime/Lut.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LUT_H__
+#define __ARM_COMPUTE_LUT_H__
+
+#include "arm_compute/core/ILut.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/LutAllocator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+class ILutAllocator;
+
+/** Basic implementation of the LUT interface */
+class Lut : public ILut
+{
+public:
+ /** Constructor */
+ Lut();
+ /** Constructor: initializes a LUT which can contain num_elements elements of data_type type.
+ *
+ * @param[in] num_elements Number of elements of the LUT.
+ * @param[in] data_type Data type of each element.
+ */
+ Lut(size_t num_elements, DataType data_type);
+ /** Return a pointer to the lut's allocator
+ *
+ * @return A pointer to the lut's allocator
+ */
+ ILutAllocator *allocator();
+
+ // Inherited methods overridden:
+ size_t num_elements() const override;
+ uint32_t index_offset() const override;
+ size_t size_in_bytes() const override;
+ DataType type() const override;
+ uint8_t *buffer() const override;
+ void clear() override;
+
+private:
+ LutAllocator _allocator; /**< Instance of the basic CPU allocator.*/
+};
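+/* Usage sketch (illustrative only): a 256-entry U8 table, filled by hand
+ * through the allocator's buffer:
+ *
+ *   Lut lut(256, DataType::U8);
+ *   lut.allocator()->allocate();
+ *   uint8_t *table = lut.buffer();
+ *   for(size_t i = 0; i < lut.num_elements(); ++i)
+ *   {
+ *       table[i] = 255 - i; // e.g. an inversion table
+ *   }
+ */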
+}
+#endif /* __ARM_COMPUTE_LUT_H__ */
diff --git a/arm_compute/runtime/LutAllocator.h b/arm_compute/runtime/LutAllocator.h
new file mode 100644
index 0000000000..76b596bfa0
--- /dev/null
+++ b/arm_compute/runtime/LutAllocator.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_LUTALLOCATOR_H__
+#define __ARM_COMPUTE_LUTALLOCATOR_H__
+
+#include "arm_compute/runtime/ILutAllocator.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic implementation of a CPU memory LUT allocator. */
+class LutAllocator : public ILutAllocator
+{
+public:
+ /** Default constructor. */
+ LutAllocator();
+ /** Return a pointer to the allocated data. */
+ uint8_t *data() const;
+
+protected:
+ /** Allocate num_elements() * sizeof(type()) of CPU memory. */
+ void allocate() override;
+ /** No-op for CPU memory
+ *
+ * @return A pointer to the beginning of the look up table's allocation.
+ */
+ uint8_t *lock() override;
+ /** No-op for CPU memory. */
+ void unlock() override;
+
+private:
+ std::unique_ptr<uint8_t[]> _buffer; /**< CPU memory allocation. */
+};
+}
+#endif /* __ARM_COMPUTE_LUTALLOCATOR_H__ */
diff --git a/arm_compute/runtime/MultiHOG.h b/arm_compute/runtime/MultiHOG.h
new file mode 100644
index 0000000000..32bad70738
--- /dev/null
+++ b/arm_compute/runtime/MultiHOG.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MULTIHOG_H__
+#define __ARM_COMPUTE_MULTIHOG_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiHOG.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/HOG.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** CPU implementation of multi HOG data-object */
+class MultiHOG : public IMultiHOG
+{
+public:
+ /** Constructor
+ *
+ * @param[in] num_models Number of HOG data objects to contain
+ *
+ */
+ MultiHOG(size_t num_models);
+
+ // Inherited methods overridden:
+ size_t num_models() const override;
+ IHOG *model(size_t index) override;
+ const IHOG *model(size_t index) const override;
+
+private:
+ size_t _num_models;
+ std::unique_ptr<HOG[]> _model;
+};
+}
+
+#endif /* __ARM_COMPUTE_MULTIHOG_H__ */
diff --git a/arm_compute/runtime/MultiImage.h b/arm_compute/runtime/MultiImage.h
new file mode 100644
index 0000000000..917e586ef8
--- /dev/null
+++ b/arm_compute/runtime/MultiImage.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MULTIIMAGE_H__
+#define __ARM_COMPUTE_MULTIIMAGE_H__
+
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <array>
+
+namespace arm_compute
+{
+class Coordinates;
+class ITensor;
+using IImage = ITensor;
+
+/** Basic implementation of the multi-planar image interface */
+class MultiImage : public IMultiImage
+{
+public:
+ /** Constructor */
+ MultiImage();
+ /** Allocate the multi-planar image
+ *
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ */
+ void init(unsigned int width, unsigned int height, Format format);
+ /** Allocate the multi-planar image
+ *
+ * @note Uses a conservative padding strategy which fits all kernels.
+ *
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ */
+ void init_auto_padding(unsigned int width, unsigned int height, Format format);
+ /** Allocate a previously initialised multi image
+ *
+ * @note The multi image must not already be allocated when calling this function.
+ */
+ void allocate();
+ /** Create a subimage from an existing MultiImage.
+ *
+ * @param[in] image Image to use backing memory from
+ * @param[in] coords Starting coordinates of the new image. Should be within the parent image sizes
+ * @param[in] width The width of the subimage
+ * @param[in] height The height of the subimage
+ */
+ void create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height);
+
+ // Inherited methods overridden:
+ const MultiImageInfo *info() const override;
+ Image *plane(unsigned int index) override;
+ const Image *plane(unsigned int index) const override;
+
+private:
+ /** Init the multi-planar image
+ *
+ * @param[in] width Width of the whole image
+ * @param[in] height Height of the whole image
+ * @param[in] format Format of the whole image
+ * @param[in] auto_padding Specifies whether the image uses auto padding
+ */
+ void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding);
+
+ MultiImageInfo _info; /**< Instance of the multi-planar image's metadata */
+ std::array<Image, 3> _plane; /**< Instances of Image holding the planes' information */
+};
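+/* Usage sketch (illustrative only): create and allocate an NV12 image, then
+ * access one of its planes:
+ *
+ *   MultiImage img;
+ *   img.init(640, 480, Format::NV12);
+ *   img.allocate();
+ *   Image *luma = img.plane(0);
+ */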
+}
+#endif /*__ARM_COMPUTE_MULTIIMAGE_H__ */
diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h
new file mode 100644
index 0000000000..6e000d8fd8
--- /dev/null
+++ b/arm_compute/runtime/NEON/INESimpleFunction.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_INESIMPLEFUNCTION_H__
+#define __ARM_COMPUTE_INESIMPLEFUNCTION_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Basic interface for functions which have a single NEON kernel */
+class INESimpleFunction : public IFunction
+{
+public:
+ /** Constructor */
+ INESimpleFunction();
+
+ // Inherited methods overridden:
+ void run() override final;
+
+protected:
+ std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */
+ NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+};
+}
+#endif /*__ARM_COMPUTE_INESIMPLEFUNCTION_H__ */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
new file mode 100644
index 0000000000..daf76f3a87
--- /dev/null
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFUNCTIONS_H__
+#define __ARM_COMPUTE_NEFUNCTIONS_H__
+
+/* Header grouping together all the NEON functions */
+#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
+#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
+#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
+#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
+#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
+#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
+#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
+#include "arm_compute/runtime/NEON/functions/NEDilate.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
+#include "arm_compute/runtime/NEON/functions/NEErode.h"
+#include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
+#include "arm_compute/runtime/NEON/functions/NEFillBorder.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
+#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
+#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
+#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
+#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
+#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
+#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
+#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
+#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h"
+#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
+#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
+#include "arm_compute/runtime/NEON/functions/NEPhase.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NERemap.h"
+#include "arm_compute/runtime/NEON/functions/NEScale.h"
+#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
+#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
+#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
+#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
+#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
+#include "arm_compute/runtime/NEON/functions/NETranspose.h"
+#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
+#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
+
+#endif /* __ARM_COMPUTE_NEFUNCTIONS_H__ */
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
new file mode 100644
index 0000000000..94c82b2f03
--- /dev/null
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCHEDULER_H__
+#define __ARM_COMPUTE_NESCHEDULER_H__
+
+#include "arm_compute/runtime/Scheduler.h"
+
+namespace arm_compute
+{
+using NEScheduler = Scheduler;
+}
+#endif /*__ARM_COMPUTE_NESCHEDULER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h
new file mode 100644
index 0000000000..266a27586a
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H__
+#define __ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEAbsoluteDifferenceKernel
+ *
+ * @note The image data type for the inputs must be U8 or S16
+ * @note The function also calculates the absolute difference when the two inputs have different image data types
+ */
+class NEAbsoluteDifference : public INESimpleFunction
+{
+public:
+ /** Set the inputs and output images
+ *
+ * @param[in] input1 Source tensor. Data types supported: U8/S16.
+ * @param[in] input2 Source tensor. Data types supported: U8/S16.
+ * @param[out] output Destination tensor. Data types supported: U8/S16.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+};
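+/* Usage sketch (illustrative only; "a", "b" and "out" are hypothetical,
+ * already allocated U8 tensors):
+ *
+ *   NEAbsoluteDifference absdiff;
+ *   absdiff.configure(&a, &b, &out);
+ *   absdiff.run();
+ */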
+}
+#endif /* __ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEAccumulate.h b/arm_compute/runtime/NEON/functions/NEAccumulate.h
new file mode 100644
index 0000000000..de532c37a0
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEAccumulate.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACCUMULATE_H__
+#define __ARM_COMPUTE_NEACCUMULATE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEAccumulateKernel */
+class NEAccumulate : public INESimpleFunction
+{
+public:
+ /** Set the input and accumulation tensors
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: S16.
+ */
+ void configure(const ITensor *input, ITensor *output);
+};
+
+/** Basic function to run @ref NEAccumulateWeightedKernel */
+class NEAccumulateWeighted : public INESimpleFunction
+{
+public:
+ /** Set the input and accumulation tensors, and the scale value
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[in] alpha The input scalar value with a value in the range [0, 1.0]
+ * @param[in,out] output Accumulated tensor. Data type supported: U8.
+ * @param[in] use_fp16 (Optional) If true, the FP16 kernels will be used; if false, the F32 kernels are used.
+ */
+ void configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16 = false);
+};
+
+/** Basic function to run @ref NEAccumulateSquaredKernel */
+class NEAccumulateSquared : public INESimpleFunction
+{
+public:
+ /** Set the input and accumulation tensors and the shift value.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[in] shift The input shift value, in the range [0, 15]
+ * @param[in,out] output Accumulated tensor. Data type supported: S16.
+ */
+ void configure(const ITensor *input, uint32_t shift, ITensor *output);
+};
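+/* Usage sketch (illustrative only; "frame" and "accum" are hypothetical,
+ * already allocated tensors of the documented types):
+ *
+ *   NEAccumulateWeighted acc;
+ *   acc.configure(&frame, 0.5f, &accum); // running average with alpha = 0.5
+ *   acc.run();
+ */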
+}
+#endif /*__ARM_COMPUTE_NEACCUMULATE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
new file mode 100644
index 0000000000..35366e16fb
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACTIVATIONLAYER_H__
+#define __ARM_COMPUTE_NEACTIVATIONLAYER_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEActivationLayerKernel
+ *
+ * @note The function simulates an activation layer with the specified activation function.
+ */
+class NEActivationLayer : public INESimpleFunction
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data type supported: QS8/F32.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ * @param[in] activation_info Activation layer parameters.
+ */
+ void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
+};
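+/* Usage sketch (illustrative only; "src" and "dst" are hypothetical F32
+ * tensors):
+ *
+ *   NEActivationLayer act;
+ *   act.configure(&src, &dst,
+ *                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ *   act.run();
+ */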
+}
+#endif /* __ARM_COMPUTE_NEACTIVATIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
new file mode 100644
index 0000000000..8e34e983c7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICADDITION_H__
+#define __ARM_COMPUTE_NEARITHMETICADDITION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEArithmeticAdditionKernel */
+class NEArithmeticAddition : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/S16.
+ * @param[in] input2 Second tensor input. Data types supported: U8/S16.
+ * @param[out] output Output tensor. Data types supported: U8/S16.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+};
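+/* Usage sketch (illustrative only; "a", "b" and "sum" are hypothetical S16
+ * tensors):
+ *
+ *   NEArithmeticAddition add;
+ *   add.configure(&a, &b, &sum, ConvertPolicy::SATURATE);
+ *   add.run();
+ */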
+}
+#endif /*__ARM_COMPUTE_NEARITHMETICADDITION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
new file mode 100644
index 0000000000..841b5912b9
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__
+#define __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEArithmeticSubtractionKernel */
+class NEArithmeticSubtraction : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/S16.
+ * @param[in] input2 Second tensor input. Data types supported: U8/S16.
+ * @param[out] output Output tensor. Data types supported: U8/S16.
+ * @param[in] policy Policy to use to handle overflow.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+};
+}
+#endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
new file mode 100644
index 0000000000..b0b5c122cb
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBatchNormalizationLayerKernel and simulate a batch normalization layer.
+ *
+ * Batch normalization is calculated by:
+ * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
+ *
+ */
+class NEBatchNormalizationLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NEBatchNormalizationLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+ * The rest are optional and used for representing batches. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+ * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
+ * @param[in] epsilon Small value to avoid division by zero.
+ */
+ void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEBatchNormalizationLayerKernel _norm_kernel; /**< Batch normalization layer kernel */
+};
+}
+#endif /* __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H__ */
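
A minimal sketch of how this layer would be set up (not part of the patch; the 16x16x32 shape and epsilon value are illustrative assumptions). Note that the per-channel parameter tensors are 1D with size equal to the number of feature maps:

    #include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Illustrative shapes: a 16x16 feature map with 32 channels (FM = 32).
        Tensor input, output, mean, var, beta, gamma;
        input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 32U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 32U), 1, DataType::F32));
        const TensorInfo params_info(TensorShape(32U), 1, DataType::F32);
        mean.allocator()->init(params_info);
        var.allocator()->init(params_info);
        beta.allocator()->init(params_info);
        gamma.allocator()->init(params_info);

        NEBatchNormalizationLayer bn;
        bn.configure(&input, &output, &mean, &var, &beta, &gamma, 0.001f);

        // Allocate all tensors after configure(), fill them, then call bn.run().
        return 0;
    }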
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
new file mode 100644
index 0000000000..0250293e97
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEAND_H__
+#define __ARM_COMPUTE_NEBITWISEAND_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBitwiseAndKernel */
+class NEBitwiseAnd : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input1 First tensor input. Data type supported: U8.
+ * @param[in] input2 Second tensor input. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEAND_H__ */
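
A minimal usage sketch (illustrative only, not part of the patch; shape is an assumption). The NEBitwiseNot, NEBitwiseOr and NEBitwiseXor functions below follow exactly the same pattern:

    #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor in1, in2, out;
        const TensorInfo info(TensorShape(64U, 64U), 1, DataType::U8); // illustrative shape
        in1.allocator()->init(info);
        in2.allocator()->init(info);
        out.allocator()->init(info);

        NEBitwiseAnd band;
        band.configure(&in1, &in2, &out);
        // Allocate, fill the inputs, then band.run() computes out = in1 & in2 element-wise.
        return 0;
    }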
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
new file mode 100644
index 0000000000..62c08ffcf9
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISENOT_H__
+#define __ARM_COMPUTE_NEBITWISENOT_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBitwiseNotKernel */
+class NEBitwiseNot : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's input and output
+ *
+ * @param[in] input Input tensor. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISENOT_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
new file mode 100644
index 0000000000..1c9a2f9d2e
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEOR_H__
+#define __ARM_COMPUTE_NEBITWISEOR_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBitwiseOrKernel */
+class NEBitwiseOr : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input1 First tensor input. Data type supported: U8.
+ * @param[in] input2 Second tensor input. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEOR_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
new file mode 100644
index 0000000000..4690f0a4e3
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEXOR_H__
+#define __ARM_COMPUTE_NEBITWISEXOR_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEBitwiseXorKernel */
+class NEBitwiseXor : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input1 First tensor input. Data type supported: U8.
+ * @param[in] input2 Second tensor input. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEXOR_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEBox3x3.h b/arm_compute/runtime/NEON/functions/NEBox3x3.h
new file mode 100644
index 0000000000..2b5440a74c
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEBox3x3.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBOX3x3_H__
+#define __ARM_COMPUTE_NEBOX3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute box filter 3x3. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEBox3x3Kernel
+ *
+ */
+class NEBox3x3 : public INESimpleFunction
+{
+public:
+ /** Initialise the function's input, output and border mode.
+ *
+ * @note The border handler is run on the input tensor.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ * @param[in] use_fp16 (Optional) If true, the FP16 kernels will be used; if false, F32 kernels are used.
+ */
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
+};
+}
+#endif /*__ARM_COMPUTE_NEBOX3x3_H__ */
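
A minimal sketch showing the border-mode interaction documented above (not part of the patch; image size and border value are assumptions). With CONSTANT or REPLICATE borders the fill-border kernel writes into the input tensor before the filter runs:

    #include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        const TensorInfo info(TensorShape(640U, 480U), 1, DataType::U8); // illustrative size
        src.allocator()->init(info);
        dst.allocator()->init(info);

        NEBox3x3 box;
        // Missing neighbours at the image edge are read as the constant value 0.
        box.configure(&src, &dst, BorderMode::CONSTANT, 0);
        // Allocate both tensors, fill src, then box.run().
        return 0;
    }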
diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h
new file mode 100644
index 0000000000..fbf2d90740
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NECannyEdge.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECANNYEDGE_H__
+#define __ARM_COMPUTE_NECANNYEDGE_H__
+
+#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute canny edge on NEON. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEFillBorderKernel (if border_mode == REPLICATE or border_mode == CONSTANT)
+ * -# @ref NESobel3x3 (if gradient_size == 3) or
+ * @ref NESobel5x5 (if gradient_size == 5) or
+ * @ref NESobel7x7 (if gradient_size == 7)
+ * -# @ref NEGradientKernel
+ * -# @ref NEEdgeNonMaxSuppressionKernel
+ * -# @ref NEEdgeTraceKernel
+ *
+ */
+class NECannyEdge : public IFunction
+{
+public:
+ /** Constructor
+ *
+ * Initialize Sobel kernel to nullptr.
+ */
+ NECannyEdge();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECannyEdge(const NECannyEdge &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECannyEdge &operator=(const NECannyEdge &) = delete;
+ /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8.
+ * @param[in] upper_thr Upper threshold used for the hysteresis.
+ * @param[in] lower_thr Lower threshold used for the hysteresis.
+ * @param[in] gradient_size Gradient size (3, 5 or 7)
+ * @param[in] norm_type Normalization type. If 1, the L1 norm is used, otherwise the L2 norm.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ * @param[in] use_fp16 (Optional) If true, the FP16 kernels will be used; if false, F32 kernels are used.
+ *
+ */
+ void configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value = 0,
+ bool use_fp16 = false);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<IFunction> _sobel; /**< Pointer to Sobel kernel */
+ std::unique_ptr<INEKernel> _gradient; /**< Gradient kernel */
+ NEEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel */
+ NEEdgeTraceKernel _edge_trace; /**< Edge tracing kernel */
+ NEFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */
+ NEFillBorderKernel _border_edge_trace; /**< Fill border before edge trace */
+ Tensor _gx; /**< Source tensor - Gx component */
+ Tensor _gy; /**< Source tensor - Gy component */
+ Tensor _magnitude; /**< Source tensor - Magnitude */
+ Tensor _phase; /**< Source tensor - Phase */
+ Tensor _nonmax; /**< Source tensor - Non-Maxima suppressed */
+ ITensor *_output; /**< Output tensor provided by the user. */
+};
+}
+#endif /* __ARM_COMPUTE_NECANNYEDGE_H__ */
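
A minimal configuration sketch for the pipeline above (not part of the patch; the threshold values are illustrative assumptions). Passing gradient_size 3 selects NESobel3x3, and norm_type 1 selects the L1 norm:

    #include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        const TensorInfo info(TensorShape(640U, 480U), 1, DataType::U8);
        src.allocator()->init(info);
        dst.allocator()->init(info);

        NECannyEdge canny;
        canny.configure(&src, &dst, /*upper_thr=*/120, /*lower_thr=*/60,
                        /*gradient_size=*/3, /*norm_type=*/1, BorderMode::REPLICATE);
        // Allocate, fill src, then canny.run().
        return 0;
    }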
diff --git a/arm_compute/runtime/NEON/functions/NEChannelCombine.h b/arm_compute/runtime/NEON/functions/NEChannelCombine.h
new file mode 100644
index 0000000000..7133553e1d
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEChannelCombine.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELCOMBINE_H__
+#define __ARM_COMPUTE_NECHANNELCOMBINE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to run @ref NEChannelCombineKernel to perform channel combination. */
+class NEChannelCombine : public INESimpleFunction
+{
+public:
+ /** Initialize the function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
+ * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
+ * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
+ * @param[in] plane3 The 2D plane that forms channel 3. Data type supported: U8
+ * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+ */
+ void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
+ /** Initialize the function's inputs and outputs.
+ *
+ * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
+ * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
+ * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
+ * @param[out] output The multi planar output image. Formats supported: NV12/NV21/IYUV/YUV444
+ */
+ void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
+};
+}
+#endif /*__ARM_COMPUTE_NECHANNELCOMBINE_H__*/
diff --git a/arm_compute/runtime/NEON/functions/NEChannelExtract.h b/arm_compute/runtime/NEON/functions/NEChannelExtract.h
new file mode 100644
index 0000000000..5e46eef3a6
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEChannelExtract.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELEXTRACT_H__
+#define __ARM_COMPUTE_NECHANNELEXTRACT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to run @ref NEChannelExtractKernel to perform channel extraction. */
+class NEChannelExtract : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source and destination.
+ *
+ * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar.
+ * @param[in] channel The channel to extract.
+ * @param[out] output The extracted channel. Format supported: U8
+ */
+ void configure(const ITensor *input, Channel channel, ITensor *output);
+ /** Initialize the function's source and destination.
+ *
+ * @param[in] input The multi-planar input image to extract channel from.
+ * @param[in] channel The channel to extract.
+ * @param[out] output The extracted channel. Format supported: U8
+ */
+ void configure(const IMultiImage *input, Channel channel, IImage *output);
+};
+}
+#endif /*__ARM_COMPUTE_NECHANNELEXTRACT_H__*/
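
A minimal sketch of the single-planar overload (not part of the patch; the image size is an assumption). A Format-based TensorInfo describes the packed RGB888 input, while the extracted channel is a plain U8 plane:

    #include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor rgb, red;
        rgb.allocator()->init(TensorInfo(TensorShape(320U, 240U), Format::RGB888));
        red.allocator()->init(TensorInfo(TensorShape(320U, 240U), Format::U8));

        NEChannelExtract extract;
        extract.configure(&rgb, Channel::R, &red);
        // Allocate, fill rgb, then extract.run().
        return 0;
    }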
diff --git a/arm_compute/runtime/NEON/functions/NEColorConvert.h b/arm_compute/runtime/NEON/functions/NEColorConvert.h
new file mode 100644
index 0000000000..2997778ed5
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEColorConvert.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOLORCONVERT_H__
+#define __ARM_COMPUTE_NECOLORCONVERT_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+class IMultiImage;
+using IImage = ITensor;
+
+/** Basic function to run @ref NEColorConvertKernel to perform color conversion */
+class NEColorConvert : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source and destination.
+ *
+ * @param[in] input The input single-planar tensor from which to convert
+ * @param[out] output The converted single-planar output tensor
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Initialize the function's source and destination.
+ *
+ * @param[in] input The multi-planar input image from which to convert
+ * @param[out] output The converted single-planar output image
+ */
+ void configure(const IMultiImage *input, IImage *output);
+ /** Initialize the function's source and destination.
+ *
+ * @param[in] input The single-planar input image from which to convert
+ * @param[out] output The converted multi-planar output image
+ */
+ void configure(const IImage *input, IMultiImage *output);
+ /** Initialize the function's source and destination.
+ *
+ * @param[in] input The multi-planar input image from which to convert
+ * @param[out] output The converted multi-planar output image
+ */
+ void configure(const IMultiImage *input, IMultiImage *output);
+};
+}
+#endif /*__ARM_COMPUTE_NECOLORCONVERT_H__*/
diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
new file mode 100644
index 0000000000..1704d9fa94
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEConvolution.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTION_H__
+#define __ARM_COMPUTE_NECONVOLUTION_H__
+
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute convolution of size 3x3. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEConvolution3x3Kernel
+ *
+ */
+class NEConvolution3x3 : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8/S16.
+ * @param[in] conv Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution, or 1 if they add up to 0.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+
+/** Basic function to execute convolution of size 5x5, 7x7, 9x9. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEConvolutionKernel or<br/>
+ * @ref NESeparableConvolutionHorKernel and @ref NESeparableConvolutionVertKernel (if convolution matrix is separable)
+ *
+ */
+template <unsigned int matrix_size>
+class NEConvolutionSquare : public IFunction
+{
+public:
+ /** Default constructor */
+ NEConvolutionSquare();
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8 or S16.
+ * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution, or 1 if they add up to 0.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ Tensor _tmp; /**< temporary buffer for output of horizontal pass */
+ bool _is_separable; /**< true if the convolution can be separated */
+ NESeparableConvolutionHorKernel<matrix_size> _kernel_hor; /**< kernel for horizontal pass of separated convolution */
+ NESeparableConvolutionVertKernel<matrix_size> _kernel_vert; /**< kernel for vertical pass of separated convolution */
+ NEConvolutionKernel<matrix_size> _kernel; /**< kernel for non-separated convolution */
+ NEFillBorderKernel _border_handler; /**< kernel for border handling */
+};
+
+/** Basic function to run 5x5 convolution. */
+using NEConvolution5x5 = NEConvolutionSquare<5>;
+/** Basic function to run 7x7 convolution. */
+using NEConvolution7x7 = NEConvolutionSquare<7>;
+/** Basic function to run 9x9 convolution. */
+using NEConvolution9x9 = NEConvolutionSquare<9>;
+
+/** Basic function to execute non-square convolution. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEConvolutionRectangleKernel
+ *
+ * @note The convolution rectangle should have dimensions of 3, 5, 7 or 9.
+ */
+class NEConvolutionRectangle : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: U8 or S16.
+ * @param[in] conv Rows x cols S16 coefficients structured as a row-major 2D array in a linear buffer.
+ * @param[in] rows Rows of convolution kernel.
+ * @param[in] cols Columns of convolution kernel.
+ * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution, or 1 if they add up to 0.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NECONVOLUTION_H__ */
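
A minimal 3x3 sketch (not part of the patch; the Gaussian coefficients and image size are illustrative assumptions). The coefficients live in a row-major linear buffer, and passing scale 0 makes the function divide by their sum:

    #include "arm_compute/runtime/NEON/functions/NEConvolution.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
        dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::S16));

        // Illustrative 3x3 Gaussian coefficients, row-major.
        const int16_t conv[9] = { 1, 2, 1,
                                  2, 4, 2,
                                  1, 2, 1 };

        NEConvolution3x3 conv3x3;
        // scale = 0: the divisor becomes the coefficient sum (16 here).
        conv3x3.configure(&src, &dst, conv, /*scale=*/0, BorderMode::UNDEFINED);
        // Allocate, fill src, then conv3x3.run().
        return 0;
    }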
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
new file mode 100644
index 0000000000..a8fff8d047
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_NECONVOLUTIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Function to reshape and perform 1xW transposition on the weights. This function calls the following kernels:
+ * -# @ref NEWeightsReshapeKernel
+ * -# @ref NEGEMMTranspose1xWKernel (executed in case GEMM is required for the operation)
+ */
+class NEConvolutionLayerReshapeWeights : public IFunction
+{
+public:
+ /** Constructor */
+ NEConvolutionLayerReshapeWeights();
+ /** Set the input and output tensors.
+ *
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/F32.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+ * @param[out] output Destination tensor. Data types supported: Same as @p weights.
+ * @param[in] transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
+ */
+ void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW);
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEWeightsReshapeKernel _weights_reshape_kernel;
+ NEGEMMTranspose1xWKernel _weights_transposed_kernel;
+ Tensor _weights_reshaped;
+ bool _transpose1xW;
+};
+
+/** Basic function to simulate a convolution layer. This function calls the following NEON kernels:
+ * -# @ref NEWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref NEIm2ColKernel
+ * -# @ref NEGEMMInterleave4x4Kernel (executed only in case GEMM is required for the operation)
+ * -# @ref NEGEMMMatrixMultiplyKernel
+ * -# @ref NECol2ImKernel
+ */
+class NEConvolutionLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEConvolutionLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QS8/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If the weights are not part of a fully connected layer,
+ * the weights tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo());
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEIm2ColKernel _input_im2col_kernel;
+ NEGEMMInterleave4x4Kernel _input_interleave_kernel;
+ NEConvolutionLayerReshapeWeights _reshape_weights;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ NECol2ImKernel _output_col2im_kernel;
+ Tensor _input_im2col_reshaped;
+ Tensor _input_interleaved_reshaped;
+ Tensor _weights_reshaped;
+ Tensor _gemm_output;
+ bool _has_bias;
+ bool _is_fully_connected_convolution;
+ bool _are_weights_reshaped;
+};
+}
+#endif /* __ARM_COMPUTE_NECONVOLUTIONLAYER_H__ */
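
A minimal configuration sketch (not part of the patch; all shapes and the stride/pad choices are illustrative assumptions). Weights are laid out as [kernel_x, kernel_y, IFM, OFM] per the doc above, so a 3x3 kernel mapping 16 input to 32 output feature maps looks like this:

    #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // 8x8x16 input, 3x3 kernels, 16 IFM -> 32 OFM, stride 1, pad 1 (same-size output).
        Tensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 32U), 1, DataType::F32));

        NEConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
        // Allocate all tensors, fill src/weights/biases, then conv.run().
        return 0;
    }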
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
new file mode 100644
index 0000000000..02ff1227c7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATE_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+class NEDepthConcatenateKernel;
+class NEFillBorderKernel;
+
+/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions)
+ * -# @ref NEDepthConcatenateKernel
+ *
+ */
+class NEDepthConcatenate : public IFunction
+{
+public:
+ /** Default constructor */
+ NEDepthConcatenate();
+ /** Initialise the kernel's inputs vector and output.
+ *
+ * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F32.
+ * @param[out] output Output tensor. Data types supported: F32.
+ */
+ void configure(std::vector<ITensor *> inputs_vector, ITensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::vector<ITensor *> _inputs_vector;
+ std::unique_ptr<NEDepthConcatenateKernel[]> _concat_kernels_vector;
+ std::unique_ptr<NEFillBorderKernel[]> _border_handlers_vector;
+ unsigned int _num_inputs;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATE_H__ */
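
A minimal sketch (not part of the patch; shapes are illustrative assumptions). The output's z dimension is the sum of the inputs' z dimensions:

    #include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
    #include "arm_compute/runtime/Tensor.h"

    #include <vector>

    using namespace arm_compute;

    int main()
    {
        // Two 32x32 inputs with 8 and 24 feature maps concatenate into 32 along z.
        Tensor in0, in1, out;
        in0.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
        in1.allocator()->init(TensorInfo(TensorShape(32U, 32U, 24U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::F32));

        std::vector<ITensor *> inputs{ &in0, &in1 };
        NEDepthConcatenate concat;
        concat.configure(inputs, &out);
        // Allocate all three tensors, fill the inputs, then concat.run().
        return 0;
    }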
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
new file mode 100644
index 0000000000..7c59ce432d
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONVERT_H__
+#define __ARM_COMPUTE_NEDEPTHCONVERT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEDepthConvertKernel */
+class NEDepthConvert : public INESimpleFunction
+{
+public:
+ /** Default constructor */
+ NEDepthConvert() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEDepthConvert(const NEDepthConvert &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ const NEDepthConvert &operator=(const NEDepthConvert &) = delete;
+ /** Initialize the function's source, destination
+ *
+ * Input format must be different than output format.
+ *
+ * Valid conversions Input -> Output:
+ * QS8 -> F32
+ * U8 -> U16, S16, S32
+ * U16 -> U8, U32
+ * S16 -> U8, S32
+ * F32 -> QS8
+ *
+ * @param[in] input The input tensor to convert. Data type supported: QS8/U8/U16/S16/F32.
+ * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/U32/S32/F32.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must satisfy 0 <= shift < 8.
+ * It is not used for fixed point conversions.
+ */
+ void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
+};
+}
+#endif /*__ARM_COMPUTE_NEDEPTHCONVERT_H__*/
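
A minimal sketch of one of the valid conversions listed above (not part of the patch; the shape is an assumption). U8 -> S16 is an up-conversion, and shift 0 leaves the values unchanged:

    #include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor u8_img, s16_img;
        u8_img.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        s16_img.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

        NEDepthConvert convert;
        convert.configure(&u8_img, &s16_img, ConvertPolicy::SATURATE, /*shift=*/0);
        // Allocate, fill u8_img, then convert.run().
        return 0;
    }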
diff --git a/arm_compute/runtime/NEON/functions/NEDerivative.h b/arm_compute/runtime/NEON/functions/NEDerivative.h
new file mode 100644
index 0000000000..57b7409b39
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDerivative.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDERIVATIVE_H__
+#define __ARM_COMPUTE_NEDERIVATIVE_H__
+
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute first order derivative operator. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEDerivativeKernel
+ *
+ */
+class NEDerivative : public IFunction
+{
+public:
+ /** Default constructor */
+ NEDerivative();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must not be NULL.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (Optional) Destination tensor. Derivative along the X direction. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination tensor. Derivative along the Y direction. Data type supported: S16.
+ * @param[in] border_mode Border mode to use
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEDerivativeKernel _kernel; /**< Derivative kernel */
+ NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */
+};
+}
+#endif /* __ARM_COMPUTE_NEDERIVATIVE_H__ */
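
A minimal sketch showing the optional-output behaviour noted above (not part of the patch; the image size is an assumption). Only the X derivative is requested, so nullptr is passed for the unused Y output:

    #include "arm_compute/runtime/NEON/functions/NEDerivative.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dx;
        src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
        dx.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::S16));

        NEDerivative deriv;
        deriv.configure(&src, &dx, nullptr, BorderMode::UNDEFINED);
        // Allocate, fill src, then deriv.run().
        return 0;
    }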
diff --git a/arm_compute/runtime/NEON/functions/NEDilate.h b/arm_compute/runtime/NEON/functions/NEDilate.h
new file mode 100644
index 0000000000..17bdb3363e
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDilate.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDILATE_H__
+#define __ARM_COMPUTE_NEDILATE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute dilate. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEDilateKernel
+ *
+ */
+class NEDilate : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and border mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Output tensor. Data type supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEDILATE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
new file mode 100644
index 0000000000..a356cac7c8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Function to run the direct convolution.
+ *
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel for the input
+ * -# @ref NEDirectConvolutionLayerBiasAccumulateKernel
+ * -# @ref NEDirectConvolutionLayerKernel
+ */
+class NEDirectConvolutionLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEDirectConvolutionLayer();
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in, out] input Input tensor. Data types supported: QS8/F32.
+ * @param[in] weights Set of kernels to convolve the input volume.
+ * Its 3rd dimension must match the 3rd dimension of the input volume.
+ * Data type supported: Same as @p input.
+ * @param[in] bias Set of biases. Data type supported: Same as @p input.
+ * @param[out] output Output tensor.
+ * Its 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEDirectConvolutionLayerBiasAccumulateKernel _accumulate_bias_kernel;
+ NEDirectConvolutionLayerKernel _conv_kernel;
+ NEFillBorderKernel _input_border_handler;
+ Tensor _accumulator;
+};
+}
+#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
new file mode 100644
index 0000000000..6cf8008480
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H__
+#define __ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H__
+
+#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+#include "arm_compute/runtime/Distribution1D.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Lut.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute histogram equalization. This function calls the following NEON kernels:
+ *
+ * -# @ref NEHistogramKernel
+ * -# @ref NECumulativeDistributionKernel
+ * -# @ref NETableLookupKernel
+ *
+ */
+class NEEqualizeHistogram : public IFunction
+{
+public:
+ /** Default Constructor. */
+ NEEqualizeHistogram();
+ /** Initialise the kernel's inputs.
+ *
+ * @note Currently the width of the input image must be a multiple of 16.
+ *
+ * @param[in] input Input image. Data type supported: U8.
+ * @param[out] output Output image. Data type supported: same as @p input
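+ *
+ * @note Minimal usage sketch (the image size is an illustrative assumption):
+ * @code
+ * Image src, dst;
+ * src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8)); // width is a multiple of 16
+ * dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ *
+ * NEEqualizeHistogram eq_hist;
+ * eq_hist.configure(&src, &dst);
+ * // Allocate and fill src, then:
+ * eq_hist.run();
+ * @endcode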
+ */
+ void configure(const IImage *input, IImage *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEHistogramKernel _histogram_kernel; /**< Kernel that calculates the histogram of input. */
+ NECumulativeDistributionKernel _cd_histogram_kernel; /**< Kernel that calculates the cumulative distribution
+ and creates the relevant LookupTable. */
+ NETableLookupKernel _map_histogram_kernel; /**< Kernel that maps the input to output using the lut. */
+ Distribution1D _hist; /**< Distribution that holds the histogram of the input image. */
+ Distribution1D _cum_dist; /**< Distribution that holds the cumulative distribution of the input histogram. */
+ Lut _cd_lut; /**< Holds the equalization lookup table. */
+ static constexpr uint32_t nr_bins{ 256 }; /**< Histogram bins of the internal histograms. */
+ static constexpr uint32_t max_range{ nr_bins - 1 }; /**< Histogram range of the internal histograms. */
+};
+}
+#endif /*__ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEErode.h b/arm_compute/runtime/NEON/functions/NEErode.h
new file mode 100644
index 0000000000..940ae18471
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEErode.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEERODE_H__
+#define __ARM_COMPUTE_NEERODE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute erode. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEErodeKernel
+ *
+ */
+class NEErode : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and border mode
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Output tensor. Data type supported: U8.
+ * @param[in] border_mode Border mode to use for the erode operation.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
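+ *
+ * @note Minimal usage sketch (the image size and border value are illustrative assumptions):
+ * @code
+ * Image src, dst;
+ * src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ * dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ *
+ * NEErode erode;
+ * erode.configure(&src, &dst, BorderMode::CONSTANT, 0);
+ * // Allocate and fill src, then:
+ * erode.run();
+ * @endcode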
+ */
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEERODE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEFastCorners.h b/arm_compute/runtime/NEON/functions/NEFastCorners.h
new file mode 100644
index 0000000000..d7c31750c5
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFastCorners.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFASTCORNERS_H__
+#define __ARM_COMPUTE_NEFASTCORNERS_H__
+
+#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute fast corners. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFastCornersKernel
+ * -# @ref NENonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
+ * -# @ref NEFillArrayKernel
+ *
+ */
+class NEFastCorners : public IFunction
+{
+public:
+ /** Constructor */
+ NEFastCorners();
+ /** Initialize the function's source, destination, detection parameters and border mode.
+ *
+ * @param[in, out] input Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+ * @param[in] nonmax_suppression If true, non-maximum suppression is applied to detected corners before being placed in the array.
+ * @param[out] corners Array of keypoints to store the results.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
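+ *
+ * @note Minimal usage sketch (the threshold and array capacity are illustrative assumptions):
+ * @code
+ * Image src;
+ * src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ * KeyPointArray corners(10000); // room for up to 10000 keypoints
+ *
+ * NEFastCorners fast;
+ * fast.configure(&src, 20.0f, true, &corners, BorderMode::UNDEFINED);
+ * // Allocate and fill src, then:
+ * fast.run();
+ * @endcode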
+ */
+ void configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEFastCornersKernel _fast_corners_kernel;
+ NEFillBorderKernel _border_handler;
+ NENonMaximaSuppression3x3Kernel _nonmax_kernel;
+ NEFillArrayKernel _fill_kernel;
+ Image _output;
+ Image _suppressed;
+ bool _non_max;
+};
+}
+#endif /*__ARM_COMPUTE_NEFASTCORNERS_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
new file mode 100644
index 0000000000..b6b7e77471
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLBORDER_H__
+#define __ARM_COMPUTE_NEFILLBORDER_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEFillBorderKernel */
+class NEFillBorder : public IFunction
+{
+public:
+ /** Initialize the function's source, destination and border_mode.
+ *
+ * @note This function fills the borders within the XY-planes.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8/QS8/S16/S32/F32
+ * @param[in] border_width Width of the tensor border in pixels.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
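+ *
+ * @note Minimal usage sketch (the tensor size and border width are illustrative assumptions):
+ * @code
+ * Tensor tensor;
+ * tensor.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ *
+ * NEFillBorder fill;
+ * fill.configure(&tensor, 2, BorderMode::REPLICATE); // 2-pixel replicated border
+ * // Allocate and fill the tensor, then:
+ * fill.run();
+ * @endcode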
+ */
+ void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLBORDER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
new file mode 100644
index 0000000000..33ec4ef721
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to reshape the weights of a Fully Connected layer with NEON. This function calls the following kernels:
+ *
+ * -# @ref NETransposeKernel (if @p transpose_weights is set to true)
+ * -# @ref NEGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ *
+ * @note The fully connected layer accepts only 2-dimensional "weights" tensors.
+ */
+class NEFullyConnectedLayerReshapeWeights : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedLayerReshapeWeights();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] transpose_weights True if the weights must be transposed.
+ * @param[in] is_batched_fc_layer True if it is a batched fully connected layer.
+ */
+ void configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NETransposeKernel _transpose_kernel;
+ NEGEMMTranspose1xWKernel _transpose1xW_kernel;
+ Tensor _transpose_output;
+ bool _transpose_weights;
+ bool _is_batched_fc_layer;
+};
+
+/** Basic function to compute a Fully Connected layer on NEON. This function calls the following NEON kernels:
+ * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped flag is set to false) (called once)
+ * -# @ref NEGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ * -# @ref NEGEMMMatrixMultiplyKernel
+ * -# @ref NEGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
+ *
+ * @note The fully connected layer accepts only 2-dimensional "weights" tensors.
+ */
+class NEFullyConnectedLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEFullyConnectedLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data type supported: QS8/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * @param[in] transpose_weights (Optional) Transpose the weights tensor if true. Defaults to true.
+ * @param[in] are_weights_reshaped (Optional) True if the weights have already been reshaped, in which case the reshape step is skipped. Defaults to false.
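+ *
+ * @note Minimal usage sketch (the layer sizes and weight orientation are illustrative assumptions):
+ * @code
+ * Tensor src, weights, biases, dst;
+ * src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));          // 128 input activations
+ * weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)); // 2D weights: 128 inputs x 64 outputs
+ * biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
+ * dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));           // 64 output activations
+ *
+ * NEFullyConnectedLayer fc;
+ * fc.configure(&src, &weights, &biases, &dst);
+ * // Allocate and fill the tensors, then:
+ * fc.run();
+ * @endcode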
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output);
+ void configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output);
+
+ NEIm2ColKernel _im2col_kernel;
+ NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+ NEGEMMInterleave4x4Kernel _interleave4x4_kernel;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ Tensor _im2col_output;
+ Tensor _interleave4x4_output;
+ Tensor _reshape_weights_output;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _is_batched_fc_layer;
+ bool _accumulate_biases;
+};
+}
+#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
new file mode 100644
index 0000000000..a40aa910a5
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMM_H__
+#define __ARM_COMPUTE_NEGEMM_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to execute GEMM on NEON. This function calls the following NEON kernels:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref NEGEMMMatrixMultiplyKernel
+ * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
+ *
+ */
+class NEGEMM : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMM();
+ /** Initialise the kernel's inputs and output.
+ *
+ * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+ * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
+ *
+ * @param[in] a First input tensor (Matrix A or Vector A). Data type supported: QS8/F16/F32
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
+ * @param[out] d Output tensor. Data type supported: same as @p a
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of matrix C
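+ *
+ * @note Minimal usage sketch computing D = alpha * A * B with C omitted (the shapes are illustrative assumptions):
+ * @code
+ * Tensor a, b, d;
+ * a.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32)); // A: 2x4 (TensorShape is width x height)
+ * b.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32)); // B: 4x3
+ * d.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32)); // D: 2x3
+ *
+ * NEGEMM gemm;
+ * gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f); // beta is unused since c == nullptr
+ * // Allocate and fill a and b, then:
+ * gemm.run();
+ * @endcode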
+ */
+ void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEGEMMInterleave4x4Kernel _interleave_kernel;
+ NEGEMMTranspose1xWKernel _transpose_kernel;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ NEGEMMMatrixAdditionKernel _ma_kernel;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ bool _run_vector_matrix_multiplication;
+ bool _run_addition;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
new file mode 100644
index 0000000000..b911fd064f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__
+#define __ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute NEGEMMInterleave4x4Kernel. This function calls the following NEON kernel:
+ *
+ * -# @ref NEGEMMInterleave4x4Kernel
+ *
+ */
+class NEGEMMInterleave4x4 : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input First input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
new file mode 100644
index 0000000000..bfb1a494b8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWP_H__
+#define __ARM_COMPUTE_NEGEMMLOWP_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute GEMMLowp on NEON. This function calls the following NEON kernels:
+*
+* -# @ref NEGEMMInterleave4x4Kernel
+* -# @ref NEGEMMTranspose1xWKernel
+* -# @ref NEGEMMLowpMatrixMultiplyKernel
+*
+*/
+class NEGEMMLowp : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMMLowp();
+ /** Initialise the kernel's inputs and output.
+ *
+ * @note GEMM_LOWP: low precision GEMM
+ * This function performs the following computation:
+ *
+ * -# Convert the values of @p a from uint8 to int32 and add @p a_offset to each of them.
+ * -# Convert the values of @p b from uint8 to int32 and add @p b_offset to each of them.
+ * -# Compute the int32 matrix product of the resulting a * b.
+ * -# Add @p output_offset to each entry of the result.
+ * -# Multiply each entry of the result by @p output_mult_int and shift right by @p shift, rounding to the nearest integer.
+ * -# Clamp the resulting int32 values to the [0..255] range and cast them to uint8.
+ *
+ * @param[in] a First input tensor (Matrix A). Data type supported: U8.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[out] output Output tensor. Data type supported: same as @p a.
+ * @param[in] a_offset Offset to be added to each element of the matrix A.
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] output_offset Offset to be added to each element of the output matrix
+ * @param[in] output_mult_int Value to be multiplied to each element of the output matrix
+ * @param[in] shift Number of bits to shift right the result.
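+ *
+ * @note Per-entry arithmetic sketch of the steps above (pseudo-code, not API calls):
+ * @code
+ * acc       = sum_k( (a[m][k] + a_offset) * (b[k][n] + b_offset) )  // int32 accumulation
+ * scaled    = ((acc + output_offset) * output_mult_int) >> shift    // rounded to the nearest integer
+ * out[m][n] = clamp(scaled, 0, 255)                                 // cast back to uint8
+ * @endcode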
+ */
+ void configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEGEMMInterleave4x4Kernel _interleave_kernel;
+ NEGEMMTranspose1xWKernel _transpose_kernel;
+ NEGEMMLowpMatrixMultiplyKernel _mm_kernel;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMLOWP_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
new file mode 100644
index 0000000000..447b8c9c70
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__
+#define __ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute NEGEMMTranspose1xWKernel. This function calls the following NEON kernel:
+ *
+ * -# @ref NEGEMMTranspose1xWKernel
+ *
+ */
+class NEGEMMTranspose1xW : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] input First input tensor. Data type supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ */
+ void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h
new file mode 100644
index 0000000000..a237e6f0e5
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN3x3_H__
+#define __ARM_COMPUTE_NEGAUSSIAN3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute Gaussian filter 3x3. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussian3x3Kernel
+ *
+ */
+class NEGaussian3x3 : public INESimpleFunction
+{
+public:
+ /** Initialise the function's input, output and border mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN3x3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
new file mode 100644
index 0000000000..699e42efb4
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIAN5x5_H__
+#define __ARM_COMPUTE_NEGAUSSIAN5x5_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute Gaussian filter 5x5. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussian5x5HorKernel
+ * -# @ref NEGaussian5x5VertKernel
+ *
+ */
+class NEGaussian5x5 : public IFunction
+{
+public:
+ /** Default constructor
+ */
+ NEGaussian5x5();
+ /** Initialise the function's input, output and border mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
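+ *
+ * @note Minimal usage sketch (the image size is an illustrative assumption):
+ * @code
+ * Image src, dst;
+ * src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ * dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ *
+ * NEGaussian5x5 gauss;
+ * gauss.configure(&src, &dst, BorderMode::REPLICATE);
+ * // Allocate and fill src, then:
+ * gauss.run();
+ * @endcode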
+ */
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ NEGaussian5x5HorKernel _kernel_hor; /**< kernel for horizontal pass */
+ NEGaussian5x5VertKernel _kernel_vert; /**< kernel for vertical pass */
+ Tensor _tmp; /**< temporary buffer for output of horizontal pass */
+ NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIAN5x5_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
new file mode 100644
index 0000000000..5f0a67ea05
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGAUSSIANPYRAMID_H__
+#define __ARM_COMPUTE_NEGAUSSIANPYRAMID_H__
+
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/Pyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Common interface for all Gaussian pyramid functions */
+class NEGaussianPyramid : public IFunction
+{
+public:
+ /** Default constructor */
+ NEGaussianPyramid();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGaussianPyramid(const NEGaussianPyramid &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGaussianPyramid &operator=(const NEGaussianPyramid &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGaussianPyramid(NEGaussianPyramid &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGaussianPyramid &operator=(NEGaussianPyramid &&) = default;
+ /** Default destructor */
+ virtual ~NEGaussianPyramid() = default;
+
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] pyramid Destination pyramid tensors. Data type supported at each level: U8.
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
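+ * @note Minimal usage sketch for the half-scale variant (the level count, image size and PyramidInfo initialisation are illustrative assumptions):
+ * @code
+ * Image src;
+ * src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ * PyramidInfo pyr_info;
+ * pyr_info.init(4, SCALE_PYRAMID_HALF, 640, 480, Format::U8); // 4 levels, half scale between levels
+ * Pyramid pyramid;
+ * pyramid.init(pyr_info);
+ *
+ * NEGaussianPyramidHalf gauss_pyr;
+ * gauss_pyr.configure(&src, &pyramid, BorderMode::UNDEFINED, 0);
+ * // Allocate src and the pyramid levels, fill src, then:
+ * gauss_pyr.run();
+ * @endcode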
+ */
+ virtual void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) = 0;
+
+protected:
+ const ITensor *_input;
+ IPyramid *_pyramid;
+ Pyramid _tmp;
+};
+
+/** Basic function to execute Gaussian pyramid with HALF scale factor. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussianPyramidHorKernel
+ * -# @ref NEGaussianPyramidVertKernel
+ *
+ */
+class NEGaussianPyramidHalf : public NEGaussianPyramid
+{
+public:
+ /** Constructor */
+ NEGaussianPyramidHalf();
+
+ // Inherited methods overridden:
+ void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
+ void run() override;
+
+private:
+ std::unique_ptr<NEFillBorderKernel[]> _border_handler;
+ std::unique_ptr<NEGaussianPyramidHorKernel[]> _horizontal_reduction;
+ std::unique_ptr<NEGaussianPyramidVertKernel[]> _vertical_reduction;
+};
+
+/** Basic function to execute Gaussian pyramid with ORB scale factor. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEGaussian5x5
+ * -# @ref NEScaleKernel
+ *
+ */
+class NEGaussianPyramidOrb : public NEGaussianPyramid
+{
+public:
+ /** Constructor */
+ NEGaussianPyramidOrb();
+
+ // Inherited methods overridden:
+ void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
+ void run() override;
+
+private:
+ std::unique_ptr<Image[]> _offsets;
+ std::unique_ptr<NEGaussian5x5[]> _gaus5x5;
+ std::unique_ptr<NEScaleKernel[]> _scale_nearest;
+};
+}
+#endif /*__ARM_COMPUTE_NEGAUSSIANPYRAMID_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
new file mode 100644
index 0000000000..b7b4909060
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDESCRIPTOR_H__
+#define __ARM_COMPUTE_NEHOGDESCRIPTOR_H__
+
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class IHOG;
+/** Basic function to calculate the HOG descriptor. This function calls the following NEON kernels:
+ *
+ * -# @ref NEHOGGradient
+ * -# @ref NEHOGOrientationBinningKernel
+ * -# @ref NEHOGBlockNormalizationKernel
+ *
+ */
+class NEHOGDescriptor : public IFunction
+{
+public:
+ /** Default constructor */
+ NEHOGDescriptor();
+ /** Initialise the function's source, destination, HOG data-object and border mode
+ *
+ * @param[in, out] input Input tensor. Data type supported: U8
+ * (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Output tensor which stores the HOG descriptor. Data type supported: F32. The number of channels is equal to the number of histogram bins per block.
+ * @param[in] hog HOG data object which describes the HOG descriptor
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
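+ *
+ * @note Minimal usage sketch (the HOG geometry below is an illustrative assumption):
+ * @code
+ * Image src;
+ * src.allocator()->init(TensorInfo(TensorShape(64U, 128U), 1, DataType::U8));
+ * HOG hog;
+ * hog.init(HOGInfo(Size2D(8U, 8U), Size2D(16U, 16U), Size2D(64U, 128U), Size2D(8U, 8U), 9U)); // cell, block, window, block stride, bins
+ * Tensor descriptor;
+ *
+ * NEHOGDescriptor hog_desc;
+ * hog_desc.configure(&src, &descriptor, &hog, BorderMode::UNDEFINED);
+ * // Allocate the tensors, fill src, then:
+ * hog_desc.run();
+ * @endcode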
+ */
+ void configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited method overridden:
+ void run() override;
+
+private:
+ NEHOGGradient _gradient;
+ NEHOGOrientationBinningKernel _orient_bin;
+ NEHOGBlockNormalizationKernel _block_norm;
+ Tensor _mag;
+ Tensor _phase;
+ Tensor _hog_space;
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGDESCRIPTOR_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDetector.h b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
new file mode 100644
index 0000000000..98b8a89bc1
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGDETECTOR_H__
+#define __ARM_COMPUTE_NEHOGDETECTOR_H__
+
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+/** Basic function to execute HOG detector based on linear SVM. This function calls the following NEON kernel:
+ *
+ * -# @ref NEHOGDetectorKernel
+ *
+ */
+class NEHOGDetector : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
+ *
+ * @attention The function does not reset the number of values in @ref IDetectionWindowArray, so it is the caller's responsibility to clear it.
+ *
+ * @param[in] input Input tensor. It is the output of @ref NEHOGDescriptor. Data type supported: F32
+ * @param[in] hog HOG data-object that describes the HOG descriptor
+ * @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects
+ * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
+ * It must be a multiple of the block stride stored in @p hog
+ * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
+ * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
+ */
+ void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGDETECTOR_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGGradient.h b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
new file mode 100644
index 0000000000..dd2d99adfe
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGGRADIENT_H__
+#define __ARM_COMPUTE_NEHOGGRADIENT_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+/** Basic function to calculate the gradient for HOG. This function calls the following NEON kernels:
+ *
+ * -# @ref NEDerivative
+ * -# NEMagnitudePhaseKernel
+ *
+ */
+class NEHOGGradient : public IFunction
+{
+public:
+ /** Default constructor */
+ NEHOGGradient();
+ /** Initialise the function's source, destinations, phase type and border mode
+ *
+ * @param[in, out] input Input tensor. Data type supported: U8.
+ * (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_magnitude Output tensor (magnitude). Data type supported: U16.
+ * @param[out] output_phase Output tensor (phase). Data type supported: U8.
+ * @param[in] phase_type Type of @ref PhaseType
+ * @param[in] border_mode Border mode to use
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
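+ *
+ * @note Minimal usage sketch (the image size and phase type are illustrative assumptions):
+ * @code
+ * Image src;
+ * src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
+ * Tensor magnitude, phase;
+ *
+ * NEHOGGradient gradient;
+ * gradient.configure(&src, &magnitude, &phase, PhaseType::UNSIGNED, BorderMode::UNDEFINED);
+ * // Allocate the tensors, fill src, then:
+ * gradient.run();
+ * @endcode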
+ */
+ void configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited method overridden:
+ void run() override;
+
+private:
+ NEDerivative _derivative;
+ std::unique_ptr<INEKernel> _mag_phase;
+ Tensor _gx;
+ Tensor _gy;
+};
+}
+#endif /*__ARM_COMPUTE_NEHOGGRADIENT_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
new file mode 100644
index 0000000000..2d07e6435f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
+#define __ARM_COMPUTE_NEHOGMULTIDETECTION_H__
+
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IMultiHOG.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
+#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following NEON kernels:
+ *
+ * -# @ref NEHOGGradient
+ * -# @ref NEHOGOrientationBinningKernel
+ * -# @ref NEHOGBlockNormalizationKernel
+ * -# @ref NEHOGDetector
+ * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
+ *
+ * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
+ * -# Phase type
+ * -# Normalization type
+ * -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
+ *
+ */
+class NEHOGMultiDetection : public IFunction
+{
+public:
+ /** Default constructor */
+ NEHOGMultiDetection();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGMultiDetection(const NEHOGMultiDetection &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEHOGMultiDetection &operator=(const NEHOGMultiDetection &) = delete;
+ /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
+ *
+ * @param[in, out] input Input tensor. Data type supported: U8
+ * (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] multi_hog Container of multiple HOG data object. Each HOG data object describes one HOG model to detect.
+ * This container should store the HOG data-objects in descending or ascending cell-size width order.
+ * This helps determine whether the HOG descriptor computation can be skipped for some of the HOG data-objects.
+ * @param[out] detection_windows Array of @ref DetectionWindow used for locating the detected objects
+ * @param[in] detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
+ * The dimension of this array must be the same as multi_hog->num_models()
+ * The i-th detection_window_stride of this array must be a multiple of the block_stride stored in the i-th multi_hog array
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
+ * @param[in] non_maxima_suppression (Optional) Flag to specify whether the non-maxima suppression is required or not.
+ * True if the non-maxima suppression stage has to be computed
+ * @param[in] min_distance (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
+ *
+ */
+ void configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value = 0,
+ float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
+
+ // Inherited method overridden:
+ void run() override;
+
+private:
+ NEHOGGradient _gradient_kernel;
+ std::unique_ptr<NEHOGOrientationBinningKernel[]> _orient_bin_kernel;
+ std::unique_ptr<NEHOGBlockNormalizationKernel[]> _block_norm_kernel;
+ std::unique_ptr<NEHOGDetector[]> _hog_detect_kernel;
+ std::unique_ptr<CPPDetectionWindowNonMaximaSuppressionKernel> _non_maxima_kernel;
+ std::unique_ptr<Tensor[]> _hog_space;
+ std::unique_ptr<Tensor[]> _hog_norm_space;
+ IDetectionWindowArray *_detection_windows;
+ Tensor _mag;
+ Tensor _phase;
+ bool _non_maxima_suppression;
+ size_t _num_orient_bin_kernel;
+ size_t _num_block_norm_kernel;
+ size_t _num_hog_detect_kernel;
+};
+}
+
+#endif /* __ARM_COMPUTE_NEHOGMULTIDETECTION_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
new file mode 100644
index 0000000000..a709871153
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHARRISCORNERS_H__
+#define __ARM_COMPUTE_NEHARRISCORNERS_H__
+
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute Harris corners detection. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NESobel3x3 (if gradient_size == 3) or<br/>
+ * @ref NESobel5x5 (if gradient_size == 5) or<br/>
+ * @ref NESobel7x7 (if gradient_size == 7)
+ * -# @ref NEFillBorderKernel
+ * -# NEHarrisScoreKernel<3> (if block_size == 3) or<br/>
+ * NEHarrisScoreKernel<5> (if block_size == 5) or<br/>
+ * NEHarrisScoreKernel<7> (if block_size == 7)
+ * -# @ref NENonMaximaSuppression3x3
+ * -# @ref CPPCornerCandidatesKernel
+ * -# @ref CPPSortEuclideanDistanceKernel
+ *
+ */
+class NEHarrisCorners : public IFunction
+{
+public:
+ /** Constructor
+ *
+ * Initialize _sobel, _harris_score and _corner_list to nullptr.
+ */
+ NEHarrisCorners();
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in, out] input Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] threshold Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
+ * @param[in] min_dist Radial Euclidean distance for the Euclidean distance stage
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] gradient_size The gradient window size to use on the input. The implementation supports 3, 5, and 7
+ * @param[in] block_size The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
+ * @param[out] corners Array of keypoints to store the results.
+ * @param[in] border_mode Border mode to use
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
+ */
+ void configure(IImage *input, float threshold, float min_dist, float sensitivity,
+ int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<IFunction> _sobel; /**< Sobel function */
+ std::unique_ptr<INEHarrisScoreKernel> _harris_score; /**< Harris score kernel */
+ NENonMaximaSuppression3x3 _non_max_suppr; /**< Non-maxima suppression function */
+ CPPCornerCandidatesKernel _candidates; /**< Sort kernel */
+ CPPSortEuclideanDistanceKernel _sort_euclidean; /**< Euclidean distance kernel */
+ NEFillBorderKernel _border_gx; /**< Border handler before running harris score */
+ NEFillBorderKernel _border_gy; /**< Border handler before running harris score */
+ Image _gx; /**< Source image - Gx component */
+ Image _gy; /**< Source image - Gy component */
+ Image _score; /**< Source image - Harris score */
+ Image _nonmax; /**< Source image - Non-Maxima suppressed image */
+ std::unique_ptr<InternalKeypoint[]> _corners_list; /**< Array of InternalKeypoint. It stores the potential corner candidates */
+ int32_t _num_corner_candidates; /**< Number of potential corner candidates */
+};
+}
+#endif /*__ARM_COMPUTE_NEHARRISCORNERS_H__ */
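A minimal usage sketch for the new NEHarrisCorners function (illustrative only, not part of the patch; the image size, array capacity and the threshold/min_dist/sensitivity values are assumptions):

```cpp
#include "arm_compute/runtime/Array.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void detect_corners()
{
    Tensor src;
    src.allocator()->init(TensorInfo(640, 480, Format::U8)); // assumed image size

    KeyPointArray corners(1000); // capacity for up to 1000 keypoints (arbitrary)

    NEHarrisCorners harris;
    // threshold, min_dist and sensitivity are placeholder values
    harris.configure(&src, 1e-4f, 5.f, 0.04f, 3 /* gradient_size */, 3 /* block_size */,
                     &corners, BorderMode::UNDEFINED);

    src.allocator()->allocate();
    // ... fill src with pixel data, then:
    harris.run(); // corners now holds the detected keypoints
}
```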
diff --git a/arm_compute/runtime/NEON/functions/NEHistogram.h b/arm_compute/runtime/NEON/functions/NEHistogram.h
new file mode 100644
index 0000000000..c24510dcb3
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEHistogram.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHISTOGRAM_H__
+#define __ARM_COMPUTE_NEHISTOGRAM_H__
+
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class IDistribution1D;
+
+/** Basic function to run @ref NEHistogramKernel. */
+class NEHistogram : public IFunction
+{
+public:
+ /** Default Constructor. */
+ NEHistogram();
+ /** Initialise the kernel's inputs.
+ *
+ * @param[in] input Input image. Data type supported: U8.
+ * @param[out] output Output distribution.
+ */
+ void configure(const IImage *input, IDistribution1D *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEHistogramKernel _histogram_kernel;
+ std::unique_ptr<uint32_t[]> _local_hist;
+ std::unique_ptr<uint32_t[]> _window_lut;
+ size_t _local_hist_size;
+ /** 256 possible pixel values as we handle only U8 images */
+ static constexpr unsigned int window_lut_default_size = 256;
+};
+}
+#endif /*__ARM_COMPUTE_NEHISTOGRAM_H__ */
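A usage sketch for NEHistogram (illustrative, not part of the patch), assuming a U8 image and a 256-bin distribution covering the full pixel range:

```cpp
#include "arm_compute/runtime/Distribution1D.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void compute_histogram()
{
    Tensor src;
    src.allocator()->init(TensorInfo(320, 240, Format::U8)); // assumed image size

    Distribution1D hist(256 /* num_bins */, 0 /* offset */, 256 /* range */);

    NEHistogram histogram;
    histogram.configure(&src, &hist);

    src.allocator()->allocate();
    // ... fill src, then:
    histogram.run(); // hist now holds the bin counts
}
```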
diff --git a/arm_compute/runtime/NEON/functions/NEIntegralImage.h b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
new file mode 100644
index 0000000000..6d7dd697e8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINTEGRALIMAGE_H__
+#define __ARM_COMPUTE_NEINTEGRALIMAGE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run a @ref NEIntegralImageKernel */
+class NEIntegralImage : public INESimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] output Destination tensor. Data type supported: U32.
+ */
+ void configure(const ITensor *input, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEINTEGRALIMAGE_H__ */
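NEIntegralImage follows the same configure/run pattern as the other simple functions; a sketch with assumed dimensions:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void integral_image()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(640, 480, Format::U8));  // input image
    dst.allocator()->init(TensorInfo(640, 480, Format::U32)); // running sums need 32 bits

    NEIntegralImage integral;
    integral.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    integral.run();
}
```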
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
new file mode 100644
index 0000000000..991ae7c293
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELAPLACIANPYRAMID_H__
+#define __ARM_COMPUTE_NELAPLACIANPYRAMID_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+#include "arm_compute/runtime/Pyramid.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute Laplacian pyramid. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEGaussianPyramidHalf
+ * -# @ref NEGaussian5x5
+ * -# @ref NEArithmeticSubtraction
+ *
+ * First a Gaussian pyramid is created. Then, for each level i, the corresponding tensor I(i) is blurred with the Gaussian 5x5 filter, and the
+ * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid:
+ * L(i) = I(i) - Gaussian5x5(I(i))
+ * Level 0 always has the same first two dimensions as the input tensor.
+*/
+class NELaplacianPyramid : public IFunction
+{
+public:
+ /** Constructor */
+ NELaplacianPyramid();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in] input Source tensor. Data type supported: U8.
+ * @param[out] pyramid Destination pyramid tensors, Data type supported at each level: S16.
+ * @param[out] output The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data type supported: S16.
+ * The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
+ * out.width() = in.width() / pow(2, pyramid_levels - 1) and out.height() = in.height() / pow(2, pyramid_levels - 1)
+ * @param[in] border_mode Border mode to use.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ size_t _num_levels;
+ NEGaussianPyramidHalf _gaussian_pyr_function;
+ std::unique_ptr<NEGaussian5x5[]> _convf;
+ std::unique_ptr<NEArithmeticSubtraction[]> _subf;
+ Pyramid _gauss_pyr;
+ Pyramid _conv_pyr;
+ NEDepthConvert _depth_function;
+};
+}
+#endif /*__ARM_COMPUTE_NELAPLACIANPYRAMID_H__ */
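A sketch of building a 4-level Laplacian pyramid from a 640x480 U8 image (depth and sizes are assumptions; the lowest-resolution output must then be 640/2^3 x 480/2^3 = 80x60):

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void build_laplacian_pyramid()
{
    constexpr size_t num_levels = 4; // assumed pyramid depth

    Tensor src, dst;
    src.allocator()->init(TensorInfo(640, 480, Format::U8));
    dst.allocator()->init(TensorInfo(80, 60, Format::S16)); // 640/2^3 x 480/2^3

    Pyramid pyramid;
    pyramid.init(PyramidInfo(num_levels, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::S16));

    NELaplacianPyramid laplacian;
    laplacian.configure(&src, &pyramid, &dst, BorderMode::REPLICATE, 0);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    pyramid.allocate();
    laplacian.run();
}
```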
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
new file mode 100644
index 0000000000..4139733499
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__
+#define __ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEScale.h"
+#include "arm_compute/runtime/Pyramid.h"
+
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute Laplacian reconstruction. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEArithmeticAddition
+ * -# @ref NEScale
+ * -# @ref NEDepthConvert
+ *
+ * This function reconstructs the original image from a Laplacian Image Pyramid.
+ *
+ * The input image is added to the last level of the Laplacian pyramid L(n-1); the resulting image is upsampled to the
+ * resolution of the next pyramid level.
+ *
+ * I(n-2) = upsample(input + L(n-1))
+ *
+ * For each pyramid level i, except i=0 and i=n-1:
+ * I(i-1) = upsample(I(i) + L(i))
+ *
+ * output = I(0) + L(0)
+*/
+class NELaplacianReconstruct : public IFunction
+{
+public:
+ /** Constructor */
+ NELaplacianReconstruct();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * The output image must have the same size as the first level of the pyramid.
+ * The input image must have the same size as the last level of the pyramid.
+ *
+ * The idea is to reconstruct the original hi-res image from a low-res representation of it and the Laplacian pyramid.
+ *
+ * @param[in] pyramid Laplacian pyramid tensors, Data type supported at each level: S16.
+ * @param[in] input Source tensor. Data type supported: S16.
+ * @param[out] output Output tensor. Data type supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(const IPyramid *pyramid, const ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ Pyramid _tmp_pyr;
+ std::unique_ptr<NEArithmeticAddition[]> _addf;
+ std::unique_ptr<NEScale[]> _scalef;
+ NEDepthConvert _depthf;
+};
+}
+#endif /*__ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__ */
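The reconstruction consumes the pyramid and low-resolution tensor produced by NELaplacianPyramid; a sketch assuming level 0 was 640x480:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Pyramid.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// pyr and lowest_res as produced by NELaplacianPyramid (S16, already allocated)
void reconstruct(Pyramid &pyr, Tensor &lowest_res)
{
    Tensor dst;
    dst.allocator()->init(TensorInfo(640, 480, Format::U8)); // size of pyramid level 0 (assumed)

    NELaplacianReconstruct rec;
    rec.configure(&pyr, &lowest_res, &dst, BorderMode::REPLICATE, 0);

    dst.allocator()->allocate();
    rec.run();
}
```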
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
new file mode 100644
index 0000000000..1b2b2ee3cf
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to compute the locally connected layer. This function calls the following NEON kernels:
+ *
+ * -# @ref NEWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref NEIm2ColKernel
+ * -# @ref NELocallyConnectedMatrixMultiplyKernel
+ * -# @ref NECol2ImKernel
+ */
+class NELocallyConnectedLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NELocallyConnectedLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+ * @param[in] weights Weights tensor. Weights are a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEIm2ColKernel _input_im2col_kernel;
+ NEWeightsReshapeKernel _weights_reshape_kernel;
+ NELocallyConnectedMatrixMultiplyKernel _mm_kernel;
+ NECol2ImKernel _output_col2im_kernel;
+ Tensor _input_im2col_reshaped;
+ Tensor _weights_reshaped;
+ Tensor _gemm_output;
+ bool _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H__ */
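A toy-shaped sketch of the locally connected layer: an 8x8x1 F32 input with 3x3 kernels, stride 1 and no padding gives a 6x6 output, hence num_patches = 36 (all shapes are illustrative):

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void locally_connected()
{
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 1U), 1, DataType::F32));
    // [kernel_x, kernel_y, IFM, OFM, num_patches]
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 1U, 16U, 36U), 1, DataType::F32));
    // [OFM, num_patches]
    biases.allocator()->init(TensorInfo(TensorShape(16U, 36U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(6U, 6U, 16U), 1, DataType::F32));

    NELocallyConnectedLayer lc;
    lc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));

    for(Tensor *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    lc.run();
}
```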
diff --git a/arm_compute/runtime/NEON/functions/NEMagnitude.h b/arm_compute/runtime/NEON/functions/NEMagnitude.h
new file mode 100644
index 0000000000..6c1f988ef0
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMagnitude.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMAGNITUDE_H__
+#define __ARM_COMPUTE_NEMAGNITUDE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run NEMagnitudePhaseKernel */
+class NEMagnitude : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs.
+ *
+ * @param[in] input1 First tensor input. Data type supported: S16.
+ * @param[in] input2 Second tensor input. Data type supported: S16.
+ * @param[out] output Output tensor. Data type supported: S16.
+ * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16 = false);
+};
+}
+#endif /*__ARM_COMPUTE_NEMAGNITUDE_H__ */
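A sketch combining NEMagnitude with Sobel gradients; gx/gy are assumed to be S16 tensors already produced and allocated by e.g. NESobel3x3:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void magnitude_from_gradients(Tensor &gx, Tensor &gy) // S16 gradients, already allocated
{
    Tensor mag;
    mag.allocator()->init(TensorInfo(640, 480, Format::S16)); // assumed image size

    NEMagnitude magnitude;
    magnitude.configure(&gx, &gy, &mag); // use_fp16 left at its default (false)

    mag.allocator()->allocate();
    magnitude.run();
}
```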
diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
new file mode 100644
index 0000000000..3770b2a270
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEANSTDDEV_H__
+#define __ARM_COMPUTE_NEMEANSTDDEV_H__
+
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+/** Basic function to execute mean and std deviation. This function calls the following NEON kernels:
+ *
+ * -# @ref NEMeanStdDevKernel
+ *
+ */
+class NEMeanStdDev : public IFunction
+{
+public:
+ /** Default Constructor. */
+ NEMeanStdDev();
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @param[in] input Input image. Data type supported: U8.
+ * @param[out] mean Output average pixel value.
+ * @param[out] stddev (Optional) Output standard deviation of pixel values.
+ */
+ void configure(const IImage *input, float *mean, float *stddev = nullptr);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that performs the mean and standard deviation calculation. */
+ uint64_t _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */
+ uint64_t _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
+};
+}
+#endif /*__ARM_COMPUTE_NEMEANSTDDEV_H__ */
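The outputs are plain floats written when run() executes; a sketch with an assumed image size:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void image_statistics()
{
    Tensor src;
    src.allocator()->init(TensorInfo(640, 480, Format::U8)); // assumed image size

    float mean   = 0.f;
    float stddev = 0.f;

    NEMeanStdDev mean_stddev;
    mean_stddev.configure(&src, &mean, &stddev); // pass nullptr for stddev to compute the mean only

    src.allocator()->allocate();
    // ... fill src, then:
    mean_stddev.run(); // mean and stddev now hold the results
}
```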
diff --git a/arm_compute/runtime/NEON/functions/NEMedian3x3.h b/arm_compute/runtime/NEON/functions/NEMedian3x3.h
new file mode 100644
index 0000000000..a3df687a35
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMedian3x3.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEDIAN3x3_H__
+#define __ARM_COMPUTE_NEMEDIAN3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute median filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEMedian3x3Kernel
+ *
+ */
+class NEMedian3x3 : public INESimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor, Data type supported: U8.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEMEDIAN3x3_H__ */
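The simple-function filters (NEMedian3x3 and the ones below) all follow the same pattern; a sketch with assumed sizes:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void median_filter()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(640, 480, Format::U8));
    dst.allocator()->init(TensorInfo(640, 480, Format::U8));

    NEMedian3x3 median;
    median.configure(&src, &dst, BorderMode::REPLICATE);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    median.run();
}
```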
diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
new file mode 100644
index 0000000000..82e75ee48b
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMINMAXLOCATION_H__
+#define __ARM_COMPUTE_NEMINMAXLOCATION_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Basic function to execute min and max location. This function calls the following NEON kernels:
+ *
+ * -# NEMinMaxKernel
+ * -# NEMinMaxLocationKernel
+ */
+class NEMinMaxLocation : public IFunction
+{
+public:
+ /** Constructor */
+ NEMinMaxLocation();
+ /** Initialise the kernel's inputs and outputs.
+ *
+ * @param[in] input Input image. Data types supported: U8/S16.
+ * @param[out] min Minimum value of image.
+ * @param[out] max Maximum value of image.
+ * @param[out] min_loc (Optional) Array of minimum value locations.
+ * @param[out] max_loc (Optional) Array of maximum value locations.
+ * @param[out] min_count (Optional) Number of minimum value encounters.
+ * @param[out] max_count (Optional) Number of maximum value encounters.
+ */
+ void configure(const IImage *input, int32_t *min, int32_t *max,
+ ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
+ uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NEMinMaxKernel _min_max; /**< Kernel that performs min/max */
+ NEMinMaxLocationKernel _min_max_loc; /**< Kernel that extracts min/max locations */
+};
+}
+#endif /*__ARM_COMPUTE_NEMINMAXLOCATION_H__ */
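A sketch for NEMinMaxLocation; the location arrays are optional and their capacities here are arbitrary:

```cpp
#include "arm_compute/runtime/Array.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void find_extrema()
{
    Tensor src;
    src.allocator()->init(TensorInfo(640, 480, Format::U8)); // assumed image size

    int32_t  min_val = 0, max_val = 0;
    uint32_t min_count = 0, max_count = 0;
    Coordinates2DArray min_loc(100), max_loc(100); // capacity of 100 locations each (arbitrary)

    NEMinMaxLocation min_max;
    min_max.configure(&src, &min_val, &max_val, &min_loc, &max_loc, &min_count, &max_count);

    src.allocator()->allocate();
    // ... fill src, then:
    min_max.run();
}
```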
diff --git a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
new file mode 100644
index 0000000000..d8a9eaebfb
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONLINEARFILTER_H__
+#define __ARM_COMPUTE_NENONLINEARFILTER_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute non linear filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NENonLinearFilterKernel
+ *
+ * @note Supported mask dimensions: squares of sizes 3 and 5
+ */
+class NENonLinearFilter : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, conv and border_mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] function Non linear function to perform
+ * @param[in] mask_size Mask size. Supported sizes: 3, 5
+ * @param[in] pattern Mask pattern
+ * @param[in] mask The given mask. Used only if pattern is set to PATTERN_OTHER
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode,
+ uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NENONLINEARFILTER_H__ */
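A sketch running a 5x5 box-pattern median; the mask pointer is only consulted for PATTERN_OTHER, so nullptr is passed here:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void box_median_5x5()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(640, 480, Format::U8));
    dst.allocator()->init(TensorInfo(640, 480, Format::U8));

    NENonLinearFilter filter;
    filter.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 5 /* mask_size */,
                     MatrixPattern::BOX, nullptr /* mask */, BorderMode::REPLICATE);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    filter.run();
}
```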
diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
new file mode 100644
index 0000000000..c87d722878
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H__
+#define __ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NENonMaximaSuppression3x3Kernel
+ *
+ */
+class NENonMaximaSuppression3x3 : public INESimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT.
+ * The constant value used with the CONSTANT border mode is 0.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination for the non-maxima suppression 3x3. Data type supported: same as @p input
+ * @param[in] border_mode Border mode to use for non-maxima suppression. The implementation supports just 2 border modes: UNDEFINED and CONSTANT
+ *
+ */
+ void configure(ITensor *input, ITensor *output, BorderMode border_mode);
+};
+}
+#endif /* __ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
new file mode 100644
index 0000000000..3202867c43
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to simulate a normalization layer. This function calls the following NEON kernels:
+ *
+ * -# @ref NEPixelWiseMultiplicationKernel
+ * -# @ref NEFillBorderKernel
+ * -# @ref NENormalizationLayerKernel
+ *
+ */
+class NENormalizationLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NENormalizationLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+ * and an optional 4th dimension for batch of inputs. Data type supported: QS8/F32
+ * @param[out] output Destination with the same dimensions, data type and number of channels of @p input
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
+ */
+ void configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */
+ NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
+ NEFillBorderKernel _border_handler; /**< Kernel to handle borders */
+ Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
+};
+}
+#endif /* __ARM_COMPUTE_NENORMALIZATIONLAYER_H__ */
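A sketch of cross-map (LRN-style) normalization on an F32 feature map; the shape and the alpha/beta/kappa values are illustrative:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void cross_map_normalization()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));

    NENormalizationLayer norm;
    // Normalize across 5 feature maps; alpha/beta/kappa are placeholder values
    norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f, 1.f));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    norm.run();
}
```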
diff --git a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
new file mode 100644
index 0000000000..0534551d19
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEOPTICALFLOW_H__
+#define __ARM_COMPUTE_NEOPTICALFLOW_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+class Pyramid;
+
+using LKInternalKeypointArray = Array<NELKInternalKeypoint>;
+/** Basic function to execute optical flow. This function calls the following NEON kernels and functions:
+ *
+ * -# @ref NEScharr3x3
+ * -# @ref NELKTrackerKernel
+ *
+ */
+class NEOpticalFlow : public IFunction
+{
+public:
+ /** Constructor */
+ NEOpticalFlow();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEOpticalFlow(const NEOpticalFlow &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEOpticalFlow &operator=(const NEOpticalFlow &) = delete;
+ /** Initialise the function input and output
+ *
+ * @param[in] old_pyramid Pointer to the pyramid for the old tensor. Data type supported: U8
+ * @param[in] new_pyramid Pointer to the pyramid for the new tensor. Data type supported: U8
+ * @param[in] old_points Pointer to the IKeyPointArray storing old key points
+ * @param[in] new_points_estimates Pointer to the IKeyPointArray storing new estimates key points
+ * @param[out] new_points Pointer to the IKeyPointArray storing new key points
+ * @param[in] termination The criteria to terminate the search for each keypoint.
+ * @param[in] epsilon The error for terminating the algorithm
+ * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
+ * @param[in] border_mode The border mode applied at scharr kernel stage
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
+ *
+ */
+ void configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates,
+ IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension,
+ bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<NEScharr3x3[]> _func_scharr;
+ std::unique_ptr<NELKTrackerKernel[]> _kernel_tracker;
+ std::unique_ptr<Tensor[]> _scharr_gx;
+ std::unique_ptr<Tensor[]> _scharr_gy;
+ IKeyPointArray *_new_points;
+ const IKeyPointArray *_new_points_estimates;
+ const IKeyPointArray *_old_points;
+ LKInternalKeypointArray _new_points_internal;
+ LKInternalKeypointArray _old_points_internal;
+ unsigned int _num_levels;
+};
+}
+#endif /*__ARM_COMPUTE_NEOPTICALFLOW_H__ */
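A sketch of tracking keypoints between two frames whose U8 pyramids were built beforehand (e.g. with NEGaussianPyramidHalf); the epsilon, iteration count and window size are placeholder values:

```cpp
#include "arm_compute/runtime/Array.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Pyramid.h"

using namespace arm_compute;

void track(const Pyramid &old_pyr, const Pyramid &new_pyr,
           const KeyPointArray &old_points, const KeyPointArray &estimates)
{
    KeyPointArray new_points(old_points.max_num_values());

    NEOpticalFlow optical_flow;
    optical_flow.configure(&old_pyr, &new_pyr, &old_points, &estimates, &new_points,
                           Termination::TERM_CRITERIA_BOTH, 0.01f /* epsilon */,
                           5 /* num_iterations */, 5 /* window_dimension */,
                           true /* use_initial_estimate */, BorderMode::UNDEFINED);
    optical_flow.run(); // new_points now holds the tracked keypoints
}
```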
diff --git a/arm_compute/runtime/NEON/functions/NEPhase.h b/arm_compute/runtime/NEON/functions/NEPhase.h
new file mode 100644
index 0000000000..985ba84c4c
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPhase.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPHASE_H__
+#define __ARM_COMPUTE_NEPHASE_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run NEMagnitudePhaseKernel */
+class NEPhase : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output.
+ *
+ * @param[in] input1 First tensor input. Data type supported: S16.
+ * @param[in] input2 Second tensor input. Data type supported: S16.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NEPHASE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
new file mode 100644
index 0000000000..de7a797cd8
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__
+#define __ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
+class NEPixelWiseMultiplication : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/QS8/S16/F32.
+ * @param[in] input2 Second tensor input. Data types supported: U8/QS8/S16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QS8/S16/F32.
+ * @param[in] scale Scale to apply after multiplication. Must be positive.
+ * @param[in] overflow_policy Overflow policy.
+ * @param[in] rounding_policy Rounding policy.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+};
+}
+#endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__ */
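A sketch of an elementwise multiply of two U8 images with saturation; the scale and policy choices are typical values, not mandated by the patch:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void multiply_images()
{
    Tensor a, b, out;
    a.allocator()->init(TensorInfo(640, 480, Format::U8));
    b.allocator()->init(TensorInfo(640, 480, Format::U8));
    out.allocator()->init(TensorInfo(640, 480, Format::U8));

    NEPixelWiseMultiplication mul;
    mul.configure(&a, &b, &out, 1.f /* scale */, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    mul.run();
}
```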
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
new file mode 100644
index 0000000000..5a9cffa5ae
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEPOOLINGLAYER_H__
+#define __ARM_COMPUTE_NEPOOLINGLAYER_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref NEPoolingLayerKernel
+ */
+class NEPoolingLayer : public INESimpleFunction
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
+};
+}
+#endif /* __ARM_COMPUTE_NEPOOLINGLAYER_H__ */
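A sketch of 2x2 max pooling with stride 2 on an F32 feature map (shapes illustrative):

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void max_pool_2x2()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(28U, 28U, 64U), 1, DataType::F32));

    NEPoolingLayer pool;
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    pool.run();
}
```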
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
new file mode 100644
index 0000000000..b1ec559817
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NERemap.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEREMAP_H__
+#define __ARM_COMPUTE_NEREMAP_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute remap. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NERemapKernel
+ */
+class NERemap : public INESimpleFunction
+{
+public:
+ /** Initialise the function's sources, destination, interpolation policy and border mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] map_x Map for X coordinates. Data type supported: F32.
+ * @param[in] map_y Map for Y coordinates. Data type supported: F32.
+ * @param[out] output Output tensor. Data type supported: U8.
+ * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
+ * @param[in] border_mode Border mode to use on the input tensor.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
+ */
+ void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
+ InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEREMAP_H__ */
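A sketch for NERemap; map_x/map_y hold, for every output pixel, the F32 source coordinates to sample and must be filled by the caller:

```cpp
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void remap_image()
{
    Tensor src, dst, map_x, map_y;
    src.allocator()->init(TensorInfo(640, 480, Format::U8));
    dst.allocator()->init(TensorInfo(640, 480, Format::U8));
    map_x.allocator()->init(TensorInfo(640, 480, Format::F32));
    map_y.allocator()->init(TensorInfo(640, 480, Format::F32));

    NERemap remap;
    remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR,
                    BorderMode::CONSTANT, 0);

    for(Tensor *t : { &src, &dst, &map_x, &map_y })
    {
        t->allocator()->allocate();
    }
    // ... fill src and the coordinate maps, then:
    remap.run();
}
```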
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
new file mode 100644
index 0000000000..e1da891dcf
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCALEIMAGE_H__
+#define __ARM_COMPUTE_NESCALEIMAGE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEScaleKernel */
+class NEScale : public INESimpleFunction
+{
+public:
+ /** Constructor
+ *
+ * Initialize NEScale
+ */
+ NEScale();
+ /** Initialize the function's source, destination, interpolation type and border_mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] policy The interpolation type.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ */
+ void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+private:
+ Tensor _offsets; /**< Offsets to access the element with NEAREST interpolation, or the top-left element with BILINEAR interpolation, in the input tensor */
+ Tensor _dx; /**< Distance between each element's real X coordinate and the smallest following integer X coordinate */
+ Tensor _dy; /**< Distance between each element's real Y coordinate and the smallest following integer Y coordinate */
+};
+}
+#endif /*__ARM_COMPUTE_NESCALEIMAGE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEScharr3x3.h b/arm_compute/runtime/NEON/functions/NEScharr3x3.h
new file mode 100644
index 0000000000..db24723902
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEScharr3x3.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESCHARR3x3_H__
+#define __ARM_COMPUTE_NESCHARR3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute a Scharr 3x3 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NEScharr3x3Kernel
+ *
+ */
+class NEScharr3x3 : public INESimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must be non-NULL.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (Optional) Destination for the Scharr 3x3 convolution along the X axis. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination for the Scharr 3x3 convolution along the Y axis. Data type supported: S16.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
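+ * A usage sketch computing both gradients (illustrative names; tensors assumed already allocated):
+ * @code
+ * NEScharr3x3 scharr;
+ * scharr.configure(&src, &grad_x, &grad_y, BorderMode::UNDEFINED);
+ * scharr.run();
+ * @endcode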
+ */
+ void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NESCHARR3x3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESobel3x3.h b/arm_compute/runtime/NEON/functions/NESobel3x3.h
new file mode 100644
index 0000000000..e2896ba058
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NESobel3x3.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL3x3_H__
+#define __ARM_COMPUTE_NESOBEL3x3_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute a Sobel 3x3 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NESobel3x3Kernel
+ *
+ */
+class NESobel3x3 : public INESimpleFunction
+{
+public:
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must be non-NULL.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (Optional) Destination for the Sobel 3x3 convolution along the X axis. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination for the Sobel 3x3 convolution along the Y axis. Data type supported: S16.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
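+ * A usage sketch computing both gradients (illustrative names; tensors assumed already allocated):
+ * @code
+ * NESobel3x3 sobel;
+ * sobel.configure(&src, &grad_x, &grad_y, BorderMode::UNDEFINED);
+ * sobel.run();
+ * @endcode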
+ */
+ void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL3x3_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESobel5x5.h b/arm_compute/runtime/NEON/functions/NESobel5x5.h
new file mode 100644
index 0000000000..fc4d665a70
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NESobel5x5.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL5x5_H__
+#define __ARM_COMPUTE_NESOBEL5x5_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute a Sobel 5x5 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NESobel5x5HorKernel
+ * -# @ref NESobel5x5VertKernel
+ *
+ */
+class NESobel5x5 : public IFunction
+{
+public:
+ /** Default constructor */
+ NESobel5x5();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must be non-NULL.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (Optional) Destination for the Sobel 5x5 convolution along the X axis. Data type supported: S16.
+ * @param[out] output_y (Optional) Destination for the Sobel 5x5 convolution along the Y axis. Data type supported: S16.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
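+ * A usage sketch computing only the X gradient (illustrative names; tensors assumed already allocated):
+ * @code
+ * NESobel5x5 sobel;
+ * sobel.configure(&src, &grad_x, nullptr, BorderMode::REPLICATE);
+ * sobel.run();
+ * @endcode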
+ */
+ void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ NESobel5x5HorKernel _sobel_hor; /**< Sobel Horizontal 5x5 kernel */
+ NESobel5x5VertKernel _sobel_vert; /**< Sobel Vertical 5x5 kernel */
+ Tensor _tmp_x; /**< Temporary buffer for Sobel X */
+ Tensor _tmp_y; /**< Temporary buffer for Sobel Y */
+ NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL5x5_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESobel7x7.h b/arm_compute/runtime/NEON/functions/NESobel7x7.h
new file mode 100644
index 0000000000..06b7c80ad6
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NESobel7x7.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOBEL7x7_H__
+#define __ARM_COMPUTE_NESOBEL7x7_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to execute a Sobel 7x7 filter. This function calls the following NEON kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
+ * -# @ref NESobel7x7HorKernel
+ * -# @ref NESobel7x7VertKernel
+ *
+ */
+class NESobel7x7 : public IFunction
+{
+public:
+ /** Default constructor */
+ NESobel7x7();
+ /** Initialise the function's source, destinations and border mode.
+ *
+ * @note At least one of output_x or output_y must be non-NULL.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output_x (Optional) Destination for the Sobel 7x7 convolution along the X axis. Data type supported: S32.
+ * @param[out] output_y (Optional) Destination for the Sobel 7x7 convolution along the Y axis. Data type supported: S32.
+ * @param[in] border_mode Border mode to use for the convolution.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+ *
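+ * A usage sketch with a constant border (illustrative names; note the S32 destinations):
+ * @code
+ * NESobel7x7 sobel;
+ * sobel.configure(&src, &grad_x, &grad_y, BorderMode::CONSTANT, 0);
+ * sobel.run();
+ * @endcode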
+ */
+ void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+protected:
+ NESobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */
+ NESobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */
+ Tensor _tmp_x; /**< Temporary buffer for Sobel X */
+ Tensor _tmp_y; /**< Temporary buffer for Sobel Y */
+ NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */
+};
+}
+#endif /*__ARM_COMPUTE_NESOBEL7x7_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
new file mode 100644
index 0000000000..dc84dec0e4
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NESOFTMAXLAYER_H__
+#define __ARM_COMPUTE_NESOFTMAXLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to compute a SoftmaxLayer.
+ *
+ * Softmax is calculated by :
+ * @f[ out = \frac{e^{x - max(x)}}{\sum{e^{x - max(x)}}} @f]
+ *
+ * This function runs the following kernels:
+ * -# @ref NELogits1DMaxKernel
+ * -# @ref NELogits1DShiftExpSumKernel
+ * -# @ref NELogits1DNormKernel
+ */
+class NESoftmaxLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NESoftmaxLayer();
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. Data types supported: QS8/F32.
+ * @param[out] output Destination tensor. Data types supported: same as @p input.
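+ *
+ * A minimal usage sketch (tensor names are illustrative; tensors assumed already allocated):
+ * @code
+ * NESoftmaxLayer softmax;
+ * softmax.configure(&logits, &probabilities);
+ * softmax.run();
+ * @endcode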
+ */
+ void configure(ITensor *input, ITensor *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ NELogits1DMaxKernel _max_kernel;
+ NELogits1DShiftExpSumKernel _shift_exp_sum_kernel;
+ NELogits1DNormKernel _norm_kernel;
+ NEFillBorderKernel _fill_border_kernel;
+ Tensor _max;
+ Tensor _sum;
+ Tensor _tmp;
+};
+}
+#endif /* __ARM_COMPUTE_NESOFTMAXLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NETableLookup.h b/arm_compute/runtime/NEON/functions/NETableLookup.h
new file mode 100644
index 0000000000..b59ffb877c
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NETableLookup.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETABLELOOKUP_H__
+#define __ARM_COMPUTE_NETABLELOOKUP_H__
+
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ILut;
+
+/** Basic function to run @ref NETableLookupKernel */
+class NETableLookup : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input First tensor input. Data types supported: U8/S16
+ * @param[in] lut Input lookup table.
+ * @param[out] output Output tensor. Data types supported: same as @p input
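+ *
+ * A minimal usage sketch (illustrative names; the lookup table is assumed to be already created and filled):
+ * @code
+ * NETableLookup lookup;
+ * lookup.configure(&src, &lut, &dst);
+ * lookup.run();
+ * @endcode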
+ */
+ void configure(const ITensor *input, const ILut *lut, ITensor *output);
+};
+}
+#endif /*__ARM_COMPUTE_NETABLELOOKUP_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEThreshold.h b/arm_compute/runtime/NEON/functions/NEThreshold.h
new file mode 100644
index 0000000000..d407ee5b15
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEThreshold.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETHRESHOLD_H__
+#define __ARM_COMPUTE_NETHRESHOLD_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEThresholdKernel */
+class NEThreshold : public INESimpleFunction
+{
+public:
+ /** Initialise the function's source, destination, thresholds and threshold type
+ *
+ * @param[in] input First tensor input. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ * @param[in] threshold Threshold. When the RANGE type is used, this is the lower threshold.
+ * @param[in] false_value Value to assign when the condition is false.
+ * @param[in] true_value Value to assign when the condition is true.
+ * @param[in] type Thresholding type. Can be either BINARY or RANGE.
+ * @param[in] upper Upper threshold. Only used with RANGE thresholding.
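+ *
+ * A minimal usage sketch binarising around a mid-level threshold (values are illustrative):
+ * @code
+ * NEThreshold threshold;
+ * threshold.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY);
+ * threshold.run();
+ * @endcode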
+ */
+ void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value = 0, uint8_t true_value = 0,
+ ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NETHRESHOLD_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
new file mode 100644
index 0000000000..4b606e7282
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NETRANSPOSE_H__
+#define __ARM_COMPUTE_NETRANSPOSE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to transpose a matrix on NEON. This function calls the following NEON kernel:
+ *
+ * -# @ref NETransposeKernel
+ *
+ */
+class NETranspose : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs and output
+ *
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: Same as @p input
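+ *
+ * A minimal usage sketch (tensor names are illustrative; the output shape must be the transpose of the input shape):
+ * @code
+ * NETranspose transpose;
+ * transpose.configure(&matrix, &matrix_transposed);
+ * transpose.run();
+ * @endcode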
+ */
+ void configure(const ITensor *input, ITensor *output);
+};
+}
+
+#endif /* __ARM_COMPUTE_NETRANSPOSE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEWarpAffine.h b/arm_compute/runtime/NEON/functions/NEWarpAffine.h
new file mode 100644
index 0000000000..f8eebe8d2a
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEWarpAffine.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWARPAFFINE_H__
+#define __ARM_COMPUTE_NEWARPAFFINE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEWarpAffineKernel */
+class NEWarpAffine : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, interpolation policy and border_mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] matrix The affine matrix. Must be 2x3 of type float.
+ * @param[in] policy The interpolation type.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
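+ *
+ * A minimal usage sketch (the coefficient values and their ordering are illustrative only):
+ * @code
+ * const float matrix[6] = { 1.f, 0.f, 0.f, 1.f, 0.f, 0.f }; // illustrative 2x3 affine coefficients
+ * NEWarpAffine warp;
+ * warp.configure(&src, &dst, matrix, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT, 0);
+ * warp.run();
+ * @endcode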
+ */
+ void configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEWARPAFFINE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
new file mode 100644
index 0000000000..d0699291b1
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEWARPPERSPECTIVE_H__
+#define __ARM_COMPUTE_NEWARPPERSPECTIVE_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEWarpPerspectiveKernel */
+class NEWarpPerspective : public INESimpleFunction
+{
+public:
+ /** Initialize the function's source, destination, interpolation policy and border_mode.
+ *
+ * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data type supported: U8
+ * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
+ * @param[in] policy The interpolation type.
+ * @param[in] border_mode Strategy to use for borders.
+ * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
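+ *
+ * A minimal usage sketch (an identity 3x3 matrix, flattened; values are illustrative):
+ * @code
+ * const float matrix[9] = { 1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f };
+ * NEWarpPerspective warp;
+ * warp.configure(&src, &dst, matrix, InterpolationPolicy::BILINEAR, BorderMode::CONSTANT, 0);
+ * warp.run();
+ * @endcode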
+ */
+ void configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+};
+}
+#endif /*__ARM_COMPUTE_NEWARPPERSPECTIVE_H__ */
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
new file mode 100644
index 0000000000..21df6a699d
--- /dev/null
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_OMPSCHEDULER_H__
+#define __ARM_COMPUTE_OMPSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+namespace arm_compute
+{
+/** Pool of threads to automatically split a kernel's execution among several threads. */
+class OMPScheduler : public IScheduler
+{
+public:
+ /** Sets the number of threads the scheduler will use to run the kernels.
+ *
+ * @param[in] num_threads If set to 0, the number returned by omp_get_max_threads() will be used; otherwise, the specified number of threads.
+ */
+ void set_num_threads(unsigned int num_threads) override;
+ /** Returns the number of threads that the OMPScheduler has in its pool.
+ *
+ * @return Number of threads available in OMPScheduler.
+ */
+ unsigned int num_threads() const override;
+ /** Access the scheduler singleton
+ *
+ * @return The scheduler
+ */
+ static OMPScheduler &get();
+ /** Multithread the execution of the passed kernel if possible.
+ *
+ * The kernel will run on a single thread if any of these conditions is true:
+ * - ICPPKernel::is_parallelisable() returns false
+ * - The scheduler has been initialized with only one thread.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] split_dimension Dimension along which to split the kernel's execution window.
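+ *
+ * A hypothetical call (kernel configuration omitted; Window::DimY selects the Y dimension):
+ * @code
+ * OMPScheduler::get().schedule(&kernel, Window::DimY);
+ * @endcode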
+ */
+ void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+ /** Constructor. */
+ OMPScheduler();
+
+ unsigned int _num_threads;
+};
+}
+#endif /* __ARM_COMPUTE_OMPSCHEDULER_H__ */
diff --git a/arm_compute/runtime/Pyramid.h b/arm_compute/runtime/Pyramid.h
new file mode 100644
index 0000000000..2e7613759f
--- /dev/null
+++ b/arm_compute/runtime/Pyramid.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_PYRAMID_H__
+#define __ARM_COMPUTE_PYRAMID_H__
+
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstddef>
+#include <memory>
+
+namespace arm_compute
+{
+class Tensor;
+
+/** Basic implementation of the pyramid interface */
+class Pyramid : public IPyramid
+{
+public:
+ /** Initialize pyramid data-object using the given Pyramid's metadata
+ *
+ * @param[in] info Pyramid's metadata
+ */
+ void init(const PyramidInfo &info);
+
+ /** Initialize pyramid data-object using the given Pyramid's metadata
+ *
+ * @note Uses a conservative padding strategy which fits all kernels.
+ *
+ * @param[in] info Pyramid's metadata
+ */
+ void init_auto_padding(const PyramidInfo &info);
+
+ /** Allocate the planes in the pyramid */
+ void allocate();
+
+ // Inherited method overridden
+ const PyramidInfo *info() const override;
+ Tensor *get_pyramid_level(size_t index) const override;
+
+private:
+ /** Initialize pyramid data-object using the given Pyramid's metadata
+ *
+ * @param[in] info Pyramid's metadata
+ * @param[in] auto_padding Specifies whether the images in the pyramid use auto padding
+ */
+ void internal_init(const PyramidInfo &info, bool auto_padding);
+
+ PyramidInfo _info{};
+ std::unique_ptr<Tensor[]> _pyramid{ nullptr };
+};
+}
+#endif /*__ARM_COMPUTE_PYRAMID_H__ */
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
new file mode 100644
index 0000000000..21f944b75f
--- /dev/null
+++ b/arm_compute/runtime/Scheduler.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SCHEDULER_H__
+#define __ARM_COMPUTE_SCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Configurable scheduler which supports multiple multithreading APIs and choosing between different schedulers at runtime. */
+class Scheduler
+{
+public:
+ enum class Type
+ {
+ ST, // Single thread.
+ CPP, // C++11 threads.
+ OMP, // OpenMP.
+ CUSTOM // Provided by the user.
+ };
+ /** Sets a user-defined scheduler and makes it the active scheduler.
+ *
+ * @param[in] scheduler A shared pointer to a custom scheduler implemented by the user.
+ */
+ static void set(std::shared_ptr<IScheduler> &scheduler);
+ /** Access the scheduler singleton.
+ *
+ * @return A reference to the scheduler object.
+ */
+ static IScheduler &get();
+ /** Set the active scheduler.
+ *
+ * Only one scheduler can be enabled at any time.
+ *
+ * @param[in] t The type of the scheduler to be enabled.
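+ *
+ * A minimal sketch switching to OpenMP when available:
+ * @code
+ * if(Scheduler::is_available(Scheduler::Type::OMP))
+ * {
+ *     Scheduler::set(Scheduler::Type::OMP);
+ * }
+ * @endcode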
+ */
+ static void set(Type t);
+ /** Returns the type of the active scheduler.
+ *
+ * @return The current scheduler's type.
+ */
+ static Type get_type();
+ /** Returns whether the given scheduler type is supported.
+ *
+ * @param[in] t Type of the scheduler to check.
+ *
+ * @return True if the given scheduler type is supported, false otherwise.
+ */
+ static bool is_available(Type t);
+
+private:
+ static Type _scheduler_type;
+ static std::shared_ptr<IScheduler> _custom_scheduler;
+ Scheduler();
+};
+}
+#endif /* __ARM_COMPUTE_SCHEDULER_H__ */
diff --git a/arm_compute/runtime/SingleThreadScheduler.h b/arm_compute/runtime/SingleThreadScheduler.h
new file mode 100644
index 0000000000..a6e1defe7c
--- /dev/null
+++ b/arm_compute/runtime/SingleThreadScheduler.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+#define __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__
+
+#include "arm_compute/runtime/IScheduler.h"
+
+namespace arm_compute
+{
+/** Scheduler that runs all kernels sequentially, in the same thread as the caller. */
+class SingleThreadScheduler : public IScheduler
+{
+public:
+ /** Sets the number of threads the scheduler will use to run the kernels.
+ *
+ * @param[in] num_threads This is ignored for this scheduler as the number of threads is always one.
+ */
+ void set_num_threads(unsigned int num_threads) override;
+ /** Returns the number of threads that the SingleThreadScheduler has, which is always 1.
+ *
+ * @return Number of threads available in SingleThreadScheduler.
+ */
+ unsigned int num_threads() const override;
+ /** Access the scheduler singleton
+ *
+ * @return The scheduler
+ */
+ static SingleThreadScheduler &get();
+ /** Runs the kernel in the same thread as the caller synchronously.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] split_dimension Dimension along which to split the kernel's execution window.
+ */
+ void schedule(ICPPKernel *kernel, unsigned int split_dimension) override;
+
+private:
+ /** Constructor. */
+ SingleThreadScheduler() = default;
+};
+}
+#endif /* __ARM_COMPUTE_SINGLETHREADSCHEDULER_H__ */
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
new file mode 100644
index 0000000000..bdb229de49
--- /dev/null
+++ b/arm_compute/runtime/SubTensor.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_SUBTENSOR_H__
+#define __ARM_COMPUTE_SUBTENSOR_H__
+
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the sub-tensor interface */
+class SubTensor : public ITensor
+{
+public:
+ /** Constructor
+ *
+ * @param[in] parent Parent tensor
+ * @param[in] tensor_shape Shape of the subtensor
+ * @param[in] coords Coordinates of the first subtensor element inside the parent tensor.
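+ *
+ * A minimal sketch creating a 16x16 view at the origin of an already initialised parent tensor (values are illustrative):
+ * @code
+ * SubTensor sub(&parent, TensorShape(16U, 16U), Coordinates(0, 0));
+ * @endcode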
+ */
+ SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords);
+ /** Default destructor */
+ ~SubTensor() = default;
+ /** Prevent instances of this class from being copy constructed */
+ SubTensor(const SubTensor &) = delete;
+ /** Prevent instances of this class from being copied */
+ SubTensor &operator=(const SubTensor &) = delete;
+ /** Allow instances of this class to be move constructed */
+ SubTensor(SubTensor &&) = default;
+ /** Allow instances of this class to be moved */
+ SubTensor &operator=(SubTensor &&) = default;
+ /** Return the parent tensor of the subtensor
+ *
+ * @return Parent tensor
+ */
+ ITensor *parent();
+
+ // Inherited methods overridden:
+ ITensorInfo *info() const override;
+ ITensorInfo *info() override;
+ uint8_t *buffer() const override;
+
+private:
+ ITensor *_parent;
+ mutable SubTensorInfo _info;
+};
+}
+#endif /*__ARM_COMPUTE_SUBTENSOR_H__ */
diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h
new file mode 100644
index 0000000000..1fe73a2353
--- /dev/null
+++ b/arm_compute/runtime/Tensor.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSOR_H__
+#define __ARM_COMPUTE_TENSOR_H__
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensorInfo;
+
+/** Basic implementation of the tensor interface */
+class Tensor : public ITensor
+{
+public:
+ /** Constructor */
+ Tensor();
+ /** Destructor: free the tensor's memory */
+ ~Tensor() = default;
+ /** Allow instances of this class to be move constructed */
+ Tensor(Tensor &&) = default;
+ /** Allow instances of this class to be moved */
+ Tensor &operator=(Tensor &&) = default;
+ /** Return a pointer to the tensor's allocator
+ *
+ * @return A pointer to the tensor's allocator
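+ *
+ * A typical allocation sketch (shape and format are illustrative):
+ * @code
+ * Tensor t;
+ * t.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
+ * t.allocator()->allocate();
+ * @endcode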
+ */
+ TensorAllocator *allocator();
+
+ // Inherited methods overridden:
+ ITensorInfo *info() const override;
+ ITensorInfo *info() override;
+ uint8_t *buffer() const override;
+
+private:
+ mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/
+};
+
+using Image = Tensor;
+}
+#endif /*__ARM_COMPUTE_TENSOR_H__ */
diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h
new file mode 100644
index 0000000000..450323b3ab
--- /dev/null
+++ b/arm_compute/runtime/TensorAllocator.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TENSORALLOCATOR_H__
+#define __ARM_COMPUTE_TENSORALLOCATOR_H__
+
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+class Coordinates;
+class TensorInfo;
+
+/** Basic implementation of a CPU memory tensor allocator. */
+class TensorAllocator : public ITensorAllocator
+{
+public:
+ /** Default constructor. */
+ TensorAllocator();
+
+ /** Make ITensorAllocator's init methods available */
+ using ITensorAllocator::init;
+
+ /** Shares the same backing memory with another tensor allocator, while the tensor info might be different.
+ * In other words, this can be used to create a sub-tensor from another tensor while sharing the same memory.
+ *
+ * @note Both tensor allocators have to be of the same specialised type.
+ *
+ * @param[in] allocator The allocator that owns the backing memory to be shared. Ownership becomes shared afterwards.
+ * @param[in] coords The starting coordinates of the new tensor inside the parent tensor.
+ * @param[in] sub_info The new tensor information (e.g. shape, data type)
+ */
+ void init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info);
+
+ /** Returns the pointer to the allocated data. */
+ uint8_t *data() const;
+
+ /** Allocate CPU memory of the size specified by the tensor's TensorInfo.
+ *
+ * @note The tensor must not already be allocated when calling this function.
+ *
+ */
+ void allocate() override;
+
+ /** Free allocated CPU memory.
+ *
+ * @note The tensor must have been allocated when calling this function.
+ *
+ */
+ void free() override;
+
+protected:
+ /** No-op for CPU memory
+ *
+ * @return A pointer to the beginning of the tensor's allocation.
+ */
+ uint8_t *lock() override;
+
+ /** No-op for CPU memory. */
+ void unlock() override;
+
+private:
+ std::shared_ptr<std::vector<uint8_t>> _buffer; /**< CPU memory allocation. */
+};
+}
+#endif /* __ARM_COMPUTE_TENSORALLOCATOR_H__ */
diff --git a/arm_compute/runtime/Utils.h b/arm_compute/runtime/Utils.h
new file mode 100644
index 0000000000..2f037a0621
--- /dev/null
+++ b/arm_compute/runtime/Utils.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_RUNTIME_UTILS_H__
+#define __ARM_COMPUTE_RUNTIME_UTILS_H__
+
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <string>
+
+namespace arm_compute
+{
+/** Convert a Scheduler::Type into a string.
+ *
+ * @param[in] t @ref Scheduler::Type to be translated to string.
+ *
+ * @return The string describing the scheduler type.
+ */
+const std::string &string_from_scheduler_type(Scheduler::Type t);
+}
+#endif /* __ARM_COMPUTE_RUNTIME_UTILS_H__ */
diff --git a/data b/data
new file mode 160000
+Subproject 1f4578a90cde937d510198fc0926adf42a81440
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
new file mode 100644
index 0000000000..f84e64e9a0
--- /dev/null
+++ b/docs/00_introduction.dox
@@ -0,0 +1,514 @@
+/** @mainpage Introduction
+
+@tableofcontents
+
+The Computer Vision and Machine Learning library is a set of functions optimised for both ARM CPUs and GPUs using SIMD technologies.
+
+Several builds of the library are available using various configurations:
+ - OS: Linux, Android or bare metal.
+ - Architecture: armv7a (32bit) or arm64-v8a (64bit)
+ - Technology: NEON / OpenCL / NEON and OpenCL
+ - Debug / Asserts / Release: Use a build with asserts enabled to debug your application and enable extra validation. Once you are sure your application works as expected you can switch to a release build of the library for maximum performance.
+
+@section S0_1_contact Contact / Support
+
+Please email developer@arm.com
+
+To facilitate the work of the support team, please provide the build information of the library you are using. To get the version of the library you are using simply run:
+
+ $ strings android-armv7a-cl-asserts/libarm_compute.so | grep arm_compute_version
+ arm_compute_version=v16.12 Build options: {'embed_kernels': '1', 'opencl': '1', 'arch': 'armv7a', 'neon': '0', 'asserts': '1', 'debug': '0', 'os': 'android', 'Werror': '1'} Git hash=f51a545d4ea12a9059fe4e598a092f1fd06dc858
+
+@section S1_file_organisation File organisation
+
+This archive contains:
+ - The arm_compute header and source files
+ - The latest Khronos OpenCL 1.2 C headers from the <a href="https://www.khronos.org/registry/cl/">Khronos OpenCL registry</a>
+ - The latest Khronos cl2.hpp from the <a href="https://www.khronos.org/registry/cl/">Khronos OpenCL registry</a> (API version 2.1 when this document was written)
+ - The sources for a stub version of libOpenCL.so to help you build your application.
+ - An examples folder containing a few examples to compile and link against the library.
+ - A @ref utils folder containing headers with some boilerplate code used by the examples.
+ - This documentation.
+
+You should have the following file organisation:
+
+ .
+ ├── arm_compute --> All the arm_compute headers
+ │   ├── core
+ │   │   ├── CL
+ │   │   │   ├── CLKernels.h --> Includes all the OpenCL kernels at once
+ │   │   │   ├── CL specialisation of all the generic objects interfaces (ICLTensor, ICLImage, etc.)
+ │   │   │   ├── kernels --> Folder containing all the OpenCL kernels
+ │   │   │   │   └── CL*Kernel.h
+ │   │   │   └── OpenCL.h --> Wrapper to configure the Khronos OpenCL C++ header
+ │   │   ├── CPP
+ │   │   │   └── kernels --> Folder containing all the CPP kernels
+ │   │   │       └── CPP*Kernel.h
+ │   │   ├── NEON
+ │   │   │   ├── kernels --> Folder containing all the NEON kernels
+ │   │   │   │   └── NE*Kernel.h
+ │   │   │   └── NEKernels.h --> Includes all the NEON kernels at once
+ │   │   ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.)
+ │   │   ├── All generic objects interfaces (ITensor, IImage, etc.)
+ │   │   └── Objects metadata classes (ImageInfo, TensorInfo, MultiImageInfo)
+ │   └── runtime
+ │       ├── CL
+ │       │   ├── CL objects & allocators (CLArray, CLImage, CLTensor, etc.)
+ │       │   ├── functions --> Folder containing all the OpenCL functions
+ │       │   │   └── CL*.h
+ │       │   └── CLFunctions.h --> Includes all the OpenCL functions at once
+ │       ├── CPP
+ │       │   └── Scheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
+ │       ├── NEON
+ │       │   ├── functions --> Folder containing all the NEON functions
+ │       │   │   └── NE*.h
+ │       │   └── NEFunctions.h --> Includes all the NEON functions at once
+ │       └── Basic implementations of the generic object interfaces (Array, Image, Tensor, etc.)
+ ├── documentation
+ │   ├── index.xhtml
+ │   └── ...
+ ├── documentation.xhtml -> documentation/index.xhtml
+ ├── examples
+ │   ├── cl_convolution.cpp
+ │   ├── neoncl_scale_median_gaussian.cpp
+ │   ├── neon_convolution.cpp
+ │   └── neon_scale.cpp
+ ├── include
+ │   └── CL
+ │       └── Khronos OpenCL C headers and C++ wrapper
+ ├── opencl-1.2-stubs
+ │   └── opencl_stubs.c
+ ├── src
+ │   ├── core
+ │   │   └── ... (Same structure as headers)
+ │   │       └── CL
+ │   │           └── cl_kernels --> All the OpenCL kernels
+ │   └── runtime
+ │       └── ... (Same structure as headers)
+ ├── tests
+ │   ├── All test related files shared between validation and benchmark
+ │   ├── CL --> OpenCL specific files (shared)
+ │   ├── NEON --> NEON specific files (shared)
+ │   ├── benchmark --> Sources for benchmarking
+ │   │   ├── Benchmark specific files
+ │   │   ├── main.cpp --> Entry point for benchmark test framework
+ │   │   ├── CL --> OpenCL benchmarking tests
+ │   │   └── NEON --> NEON benchmarking tests
+ │   ├── validation --> Sources for validation
+ │   │   ├── Validation specific files
+ │   │   ├── main.cpp --> Entry point for validation test framework
+ │   │   ├── CL --> OpenCL validation tests
+ │   │   ├── NEON --> NEON validation tests
+ │   │   └── UNIT --> Library validation tests
+ │   └── dataset --> Datasets defining common sets of input parameters
+ └── utils --> Boilerplate code used by examples
+     └── Utils.h
+
+@section S2_versions_changelog Release versions and changelog
+
+@subsection S2_1_versions Release versions
+
+All releases are numbered vYY.MM, where YY are the last two digits of the year and MM is the month number.
+If there is more than one release in a month, an extra sequential number is appended at the end:
+
+ v17.03 (First release of March 2017)
+ v17.03.1 (Second release of March 2017)
+ v17.04 (First release of April 2017)
+
+@note We aim to publish one major public release with new features per quarter. All releases in between will contain only bug fixes.
+
+@subsection S2_2_changelog Changelog
+
+v17.06 Public major release
+ - Various bug fixes
+ - Added support for 8-bit fixed point (QS8) to the various NEON machine learning kernels.
+ - Added unit tests and benchmarks (AlexNet, LeNet)
+ - Added support for sub tensors.
+ - Added infrastructure to provide GPU specific optimisation for some OpenCL kernels.
+ - Added @ref arm_compute::OMPScheduler (OpenMP) scheduler for NEON
+ - Added @ref arm_compute::SingleThreadScheduler scheduler for NEON (For bare metal)
+ - Users can specify their own scheduler by implementing the @ref arm_compute::IScheduler interface.
+ - New OpenCL kernels / functions:
+ - @ref arm_compute::CLBatchNormalizationLayerKernel / @ref arm_compute::CLBatchNormalizationLayer
+ - @ref arm_compute::CLDepthConcatenateKernel / @ref arm_compute::CLDepthConcatenate
+ - @ref arm_compute::CLHOGOrientationBinningKernel, @ref arm_compute::CLHOGBlockNormalizationKernel, @ref arm_compute::CLHOGDetectorKernel / @ref arm_compute::CLHOGDescriptor, @ref arm_compute::CLHOGDetector, @ref arm_compute::CLHOGGradient, @ref arm_compute::CLHOGMultiDetection
+ - @ref arm_compute::CLLocallyConnectedMatrixMultiplyKernel / @ref arm_compute::CLLocallyConnectedLayer
+ - @ref arm_compute::CLWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayerReshapeWeights
+ - New C++ kernels:
+ - @ref arm_compute::CPPDetectionWindowNonMaximaSuppressionKernel
+ - New NEON kernels / functions:
+ - @ref arm_compute::NEBatchNormalizationLayerKernel / @ref arm_compute::NEBatchNormalizationLayer
+ - @ref arm_compute::NEDepthConcatenateKernel / @ref arm_compute::NEDepthConcatenate
+ - @ref arm_compute::NEDirectConvolutionLayerKernel / @ref arm_compute::NEDirectConvolutionLayer
+ - @ref arm_compute::NELocallyConnectedMatrixMultiplyKernel / @ref arm_compute::NELocallyConnectedLayer
+ - @ref arm_compute::NEWeightsReshapeKernel / @ref arm_compute::NEConvolutionLayerReshapeWeights
+
+v17.05 Public bug fixes release
+ - Various bug fixes
+ - Ported the remaining functions to use accurate padding.
+ - The library no longer links directly against OpenCL (it uses dlopen / dlsym at runtime instead to determine whether or not OpenCL is available).
+ - Added "free" method to allocator.
+ - Minimum version of g++ required for armv7 Linux changed from 4.8 to 4.9
+
+v17.04 Public bug fixes release
+
+ The following functions have been ported to use the new accurate padding:
+ - @ref arm_compute::CLColorConvertKernel
+ - @ref arm_compute::CLEdgeNonMaxSuppressionKernel
+ - @ref arm_compute::CLEdgeTraceKernel
+ - @ref arm_compute::CLGaussianPyramidHorKernel
+ - @ref arm_compute::CLGaussianPyramidVertKernel
+ - @ref arm_compute::CLGradientKernel
+ - @ref arm_compute::NEChannelCombineKernel
+ - @ref arm_compute::NEFillArrayKernel
+ - @ref arm_compute::NEGaussianPyramidHorKernel
+ - @ref arm_compute::NEGaussianPyramidVertKernel
+ - @ref arm_compute::NEHarrisScoreFP16Kernel
+ - @ref arm_compute::NEHarrisScoreKernel
+ - @ref arm_compute::NEHOGDetectorKernel
+ - @ref arm_compute::NELogits1DMaxKernel
+ - @ref arm_compute::NELogits1DShiftExpSumKernel
+ - @ref arm_compute::NELogits1DNormKernel
+ - @ref arm_compute::NENonMaximaSuppression3x3FP16Kernel
+ - @ref arm_compute::NENonMaximaSuppression3x3Kernel
+
+
+v17.03.1 First Major public release of the sources
+ - Renamed the library to arm_compute
+ - New CPP target introduced for C++ kernels shared between NEON and CL functions.
+ - New padding calculation interface introduced and ported most kernels / functions to use it.
+ - New OpenCL kernels / functions:
+ - @ref arm_compute::CLGEMMLowpMatrixMultiplyKernel / @ref arm_compute::CLGEMMLowp
+ - New NEON kernels / functions:
+ - @ref arm_compute::NENormalizationLayerKernel / @ref arm_compute::NENormalizationLayer
+ - @ref arm_compute::NETransposeKernel / @ref arm_compute::NETranspose
+ - @ref arm_compute::NELogits1DMaxKernel, @ref arm_compute::NELogits1DShiftExpSumKernel, @ref arm_compute::NELogits1DNormKernel / @ref arm_compute::NESoftmaxLayer
+ - @ref arm_compute::NEIm2ColKernel, @ref arm_compute::NECol2ImKernel, arm_compute::NEConvolutionLayerWeightsReshapeKernel / @ref arm_compute::NEConvolutionLayer
+ - @ref arm_compute::NEGEMMMatrixAccumulateBiasesKernel / @ref arm_compute::NEFullyConnectedLayer
+ - @ref arm_compute::NEGEMMLowpMatrixMultiplyKernel / @ref arm_compute::NEGEMMLowp
+
+v17.03 Sources preview
+ - New OpenCL kernels / functions:
+ - @ref arm_compute::CLGradientKernel, @ref arm_compute::CLEdgeNonMaxSuppressionKernel, @ref arm_compute::CLEdgeTraceKernel / @ref arm_compute::CLCannyEdge
+ - GEMM refactoring + FP16 support: @ref arm_compute::CLGEMMInterleave4x4Kernel, @ref arm_compute::CLGEMMTranspose1xWKernel, @ref arm_compute::CLGEMMMatrixMultiplyKernel, @ref arm_compute::CLGEMMMatrixAdditionKernel / @ref arm_compute::CLGEMM
+ - @ref arm_compute::CLGEMMMatrixAccumulateBiasesKernel / @ref arm_compute::CLFullyConnectedLayer
+ - @ref arm_compute::CLTransposeKernel / @ref arm_compute::CLTranspose
+ - @ref arm_compute::CLLKTrackerInitKernel, @ref arm_compute::CLLKTrackerStage0Kernel, @ref arm_compute::CLLKTrackerStage1Kernel, @ref arm_compute::CLLKTrackerFinalizeKernel / @ref arm_compute::CLOpticalFlow
+ - @ref arm_compute::CLNormalizationLayerKernel / @ref arm_compute::CLNormalizationLayer
+ - @ref arm_compute::CLLaplacianPyramid, @ref arm_compute::CLLaplacianReconstruct
+ - New NEON kernels / functions:
+ - @ref arm_compute::NEActivationLayerKernel / @ref arm_compute::NEActivationLayer
+ - GEMM refactoring + FP16 support (Requires armv8.2 CPU): @ref arm_compute::NEGEMMInterleave4x4Kernel, @ref arm_compute::NEGEMMTranspose1xWKernel, @ref arm_compute::NEGEMMMatrixMultiplyKernel, @ref arm_compute::NEGEMMMatrixAdditionKernel / @ref arm_compute::NEGEMM
+ - @ref arm_compute::NEPoolingLayerKernel / @ref arm_compute::NEPoolingLayer
+
+v17.02.1 Sources preview
+ - New OpenCL kernels / functions:
+ - @ref arm_compute::CLLogits1DMaxKernel, @ref arm_compute::CLLogits1DShiftExpSumKernel, @ref arm_compute::CLLogits1DNormKernel / @ref arm_compute::CLSoftmaxLayer
+ - @ref arm_compute::CLPoolingLayerKernel / @ref arm_compute::CLPoolingLayer
+ - @ref arm_compute::CLIm2ColKernel, @ref arm_compute::CLCol2ImKernel, @ref arm_compute::CLConvolutionLayerWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayer
+ - @ref arm_compute::CLRemapKernel / @ref arm_compute::CLRemap
+ - @ref arm_compute::CLGaussianPyramidHorKernel, @ref arm_compute::CLGaussianPyramidVertKernel / @ref arm_compute::CLGaussianPyramid, @ref arm_compute::CLGaussianPyramidHalf, @ref arm_compute::CLGaussianPyramidOrb
+ - @ref arm_compute::CLMinMaxKernel, @ref arm_compute::CLMinMaxLocationKernel / @ref arm_compute::CLMinMaxLocation
+ - @ref arm_compute::CLNonLinearFilterKernel / @ref arm_compute::CLNonLinearFilter
+ - New NEON FP16 kernels (Requires armv8.2 CPU)
+ - @ref arm_compute::NEAccumulateWeightedFP16Kernel
+ - @ref arm_compute::NEBox3x3FP16Kernel
+ - @ref arm_compute::NENonMaximaSuppression3x3FP16Kernel
+
+v17.02 Sources preview
+ - New OpenCL kernels / functions:
+ - @ref arm_compute::CLActivationLayerKernel / @ref arm_compute::CLActivationLayer
+ - @ref arm_compute::CLChannelCombineKernel / @ref arm_compute::CLChannelCombine
+ - @ref arm_compute::CLDerivativeKernel / @ref arm_compute::CLChannelExtract
+ - @ref arm_compute::CLFastCornersKernel / @ref arm_compute::CLFastCorners
+ - @ref arm_compute::CLMeanStdDevKernel / @ref arm_compute::CLMeanStdDev
+ - New NEON kernels / functions:
+ - HOG / SVM: @ref arm_compute::NEHOGOrientationBinningKernel, @ref arm_compute::NEHOGBlockNormalizationKernel, @ref arm_compute::NEHOGDetectorKernel, arm_compute::NEHOGNonMaximaSuppressionKernel / @ref arm_compute::NEHOGDescriptor, @ref arm_compute::NEHOGDetector, @ref arm_compute::NEHOGGradient, @ref arm_compute::NEHOGMultiDetection
+ - @ref arm_compute::NENonLinearFilterKernel / @ref arm_compute::NENonLinearFilter
+ - Introduced a CLScheduler to manage the default context and command queue used by the runtime library and create synchronisation events.
+ - Switched all the kernels / functions to use tensors instead of images.
+ - Updated documentation to include instructions to build the library from sources.
+
+v16.12 Binary preview release
+ - Original release
+
+@section S3_how_to_build How to build the library and the examples
+
+@subsection S3_1_build_options Build options
+
+SCons 2.3 or above is required to build the library.
+To see the available build options simply run ```scons -h```:
+
+ debug: Debug (default=0) (0|1)
+ default: 0
+ actual: 0
+
+ asserts: Enable asserts (This flag is forced to 1 for debug=1) (default=0) (0|1)
+ default: 0
+ actual: 0
+
+ arch: Target Architecture (default=armv7a) (armv7a|arm64-v8a|arm64-v8.2-a|x86_32|x86_64)
+ default: armv7a
+ actual: armv7a
+
+ os: Target OS (default=linux) (linux|android|bare_metal)
+ default: linux
+ actual: linux
+
+ build: Build type: (default=cross_compile) (native|cross_compile)
+ default: cross_compile
+ actual: cross_compile
+
+ Werror: Enable/disable the -Werror compilation flag (Default=1) (0|1)
+ default: 1
+ actual: 1
+
+ opencl: Enable OpenCL support(Default=1) (0|1)
+ default: 1
+ actual: 1
+
+ neon: Enable Neon support(Default=0) (0|1)
+ default: 0
+ actual: 0
+
+ embed_kernels: Embed OpenCL kernels in library binary(Default=0) (0|1)
+ default: 0
+ actual: 0
+
+ scheduler: Scheduler backend(Default=cpp) (cpp|pthread|openmp)
+ default: cpp
+ actual: cpp
+
+ set_soname: Set the library's soname and shlibversion (Requires SCons 2.4 or above) (yes|no)
+ default: 0
+ actual: False
+
+ extra_cxx_flags: Extra CXX flags to be appended to the build command
+ default:
+ actual:
+
+Debug / asserts:
+ - With debug=1 asserts are enabled, and the library is built with symbols and no optimisations enabled.
+ - With debug=0 and asserts=1: optimisations are enabled and symbols are removed, however all the asserts are still present (this is about 20% slower than the release build).
+ - With debug=0 and asserts=0: all optimisations are enabled and no validation is performed; if the application misuses the library it is likely to crash (only use this mode once you are sure your application works as expected).
+
+Architecture: The x86_32 and x86_64 targets can only be used with neon=0 and opencl=1.
+
+OS: Choose the operating system you are targeting: Linux, Android or bare metal.
+@note bare metal can only be used for NEON (not OpenCL); only static libraries get built, and NEON's multi-threading support is disabled.
+
+Build type: you can either build directly on your device (native) or cross compile from your desktop machine (cross-compile). In both cases make sure the compiler is available in your path.
+
+Werror: If you are compiling with the same toolchains as the ones used in this guide then there shouldn't be any warnings, and you should therefore be able to keep Werror=1. If the library fails to build with a different compiler version because warnings are treated as errors, and you are sure the warnings are not important, you can try building with Werror=0 (but please report the issue either on Github or by an email to developer@arm.com so that it can be addressed).
+
+OpenCL / NEON: Choose which SIMD technology you want to target. (NEON for ARM Cortex-A CPUs or OpenCL for ARM Mali GPUs)
+
+embed_kernels: For OpenCL only: set embed_kernels=1 if you want the OpenCL kernels to be embedded in the library's binaries instead of being read from separate ".cl" files. If embed_kernels is set to 0 then the application can set the path to the folder containing the OpenCL kernel files by calling CLKernelLibrary::init(). By default the path is set to "./cl_kernels".
+
+set_soname: Do you want to build a versioned version of the library?
+If enabled the library will contain a SONAME and SHLIBVERSION and some symlinks will automatically be created between the objects.
+Example:
+ libarm_compute_core.so -> libarm_compute_core.so.1.0.0
+ libarm_compute_core.so.1 -> libarm_compute_core.so.1.0.0
+ libarm_compute_core.so.1.0.0
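+
+You can verify the soname recorded in the built library with readelf, for example:
+
+ readelf -d libarm_compute_core.so.1.0.0 | grep SONAME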
+
+@note This option is disabled by default as it requires SCons version 2.4 or above.
+
+extra_cxx_flags: Custom CXX flags which will be appended to the end of the build command.
+
+@subsection S3_2_linux Linux
+
+@subsubsection S3_2_1_library How to build the library?
+
+For Linux, the library was successfully built and tested using the following Linaro GCC toolchain:
+
+ - gcc-linaro-arm-linux-gnueabihf-4.9-2014.07_linux
+ - gcc-linaro-4.9-2016.02-x86_64_aarch64-linux-gnu
+ - gcc-linaro-6.3.1-2017.02-i686_aarch64-linux-gnu
+
+@note If you are building with opencl=1 then scons will expect to find libOpenCL.so either in the current directory or in "build" (See the section below if you need a stub OpenCL library to link against)
+
+To cross-compile the library in debug mode, with NEON only support, for Linux 32bit:
+
+ scons Werror=1 -j8 debug=1 neon=1 opencl=0 os=linux arch=armv7a
+
+To cross-compile the library in asserts mode, with OpenCL only support, for Linux 64bit:
+
+ scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=linux arch=arm64-v8a
+
+You can also compile the library natively on an ARM device by using <b>build=native</b>:
+
+ scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=arm64-v8a build=native
+ scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a build=native
+
+@note g++ for ARM is mono-arch, therefore if you want to compile for Linux 32bit on a Linux 64bit platform you will have to use a cross compiler.
+
+For example on a 64bit Debian based system you would have to install <b>g++-arm-linux-gnueabihf</b>
+
+ apt-get install g++-arm-linux-gnueabihf
+
+Then run
+
+ scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a build=cross_compile
+
+or simply remove the build parameter as build=cross_compile is the default value:
+
+ scons Werror=1 -j8 debug=0 neon=1 opencl=0 os=linux arch=armv7a
+
+@attention To cross compile with opencl=1 you need to make sure to have a version of libOpenCL matching your target architecture.
+
+@subsubsection S3_2_2_examples How to manually build the examples?
+
+The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library.
+
+@note The following command lines assume the arm_compute and libOpenCL binaries are present in the current directory or in the system library path. If this is not the case you can specify the location of the pre-built library with the compiler option -L. When building the OpenCL example the commands below assume that the CL headers are located in the include folder where the command is executed.
+
+To cross compile a NEON example for Linux 32bit:
+
+ arm-linux-gnueabihf-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -std=c++11 -mfpu=neon -L. -larm_compute -o neon_convolution
+
+To cross compile a NEON example for Linux 64bit:
+
+ aarch64-linux-gnu-g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -std=c++11 -L. -larm_compute -o neon_convolution
+
+(notice that the only difference from the 32-bit command is that the -mfpu option is not needed and the compiler's name is different)
+
+To cross compile an OpenCL example for Linux 32bit:
+
+ arm-linux-gnueabihf-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -mfpu=neon -L. -larm_compute -lOpenCL -o cl_convolution
+
+To cross compile an OpenCL example for Linux 64bit:
+
+ aarch64-linux-gnu-g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -L. -larm_compute -lOpenCL -o cl_convolution
+
+(notice that the only difference from the 32-bit command is that the -mfpu option is not needed and the compiler's name is different)
+
+To compile natively (i.e. directly on an ARM device) for NEON for Linux 32bit:
+
+ g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -std=c++11 -mfpu=neon -larm_compute -o neon_convolution
+
+To compile natively (i.e. directly on an ARM device) for NEON for Linux 64bit:
+
+ g++ examples/neon_convolution.cpp utils/Utils.cpp -I. -std=c++11 -larm_compute -o neon_convolution
+
+(notice that the only difference from the 32-bit command is that the -mfpu option is not needed)
+
+To compile natively (i.e. directly on an ARM device) for OpenCL for Linux 32bit or Linux 64bit:
+
+ g++ examples/cl_convolution.cpp utils/Utils.cpp -I. -Iinclude -std=c++11 -larm_compute -lOpenCL -o cl_convolution
+
+
+@note These two commands assume libarm_compute.so is available in your library path; if not, add its location using -L.
+
+To run the built executable simply run:
+
+ LD_LIBRARY_PATH=build ./neon_convolution
+
+or
+
+ LD_LIBRARY_PATH=build ./cl_convolution
+
+@note If you built the library with support for both OpenCL and NEON you will need to link against OpenCL even if your application only uses NEON.
+
+@subsection S3_3_android Android
+
+For Android, the library was successfully built and tested using Google's standalone toolchains:
+ - arm-linux-androideabi-4.9 for armv7a (clang++)
+ - aarch64-linux-android-4.9 for arm64-v8a (g++)
+
+Here is a guide to <a href="https://developer.android.com/ndk/guides/standalone_toolchain.html">creating your Android standalone toolchains from the NDK</a>:
+
+- Download the NDK r14 from here: https://developer.android.com/ndk/downloads/index.html
+- Make sure you have Python 2 installed on your machine.
+- Generate the 32-bit and/or 64-bit toolchains by running the following commands:
+
+
+ $NDK/build/tools/make_standalone_toolchain.py --arch arm64 --install-dir $MY_TOOLCHAINS/aarch64-linux-android-4.9 --stl gnustl
+ $NDK/build/tools/make_standalone_toolchain.py --arch arm --install-dir $MY_TOOLCHAINS/arm-linux-androideabi-4.9 --stl gnustl
+
+@attention Due to some NDK issues make sure you use g++ & gnustl for aarch64 and clang++ & gnustl for armv7
+
+@note Make sure to add the toolchains to your PATH: export PATH=$PATH:$MY_TOOLCHAINS/aarch64-linux-android-4.9/bin:$MY_TOOLCHAINS/arm-linux-androideabi-4.9/bin
+
+@subsubsection S3_3_1_library How to build the library?
+
+@note If you are building with opencl=1 then scons will expect to find libOpenCL.so either in the current directory or in "build" (See the section below if you need a stub OpenCL library to link against)
+
+To cross-compile the library in debug mode, with NEON only support, for Android 32bit:
+
+ CXX=clang++ CC=clang scons Werror=1 -j8 debug=1 neon=1 opencl=0 os=android arch=armv7a
+
+To cross-compile the library in asserts mode, with OpenCL only support, for Android 64bit:
+
+ scons Werror=1 -j8 debug=0 asserts=1 neon=0 opencl=1 embed_kernels=1 os=android arch=arm64-v8a
+
+@subsubsection S3_3_2_examples How to manually build the examples?
+
+The examples get automatically built by scons as part of the build process of the library described above. This section just describes how you can build and link your own application against our library.
+
+@note The following command lines assume the arm_compute binaries are present in the current directory or in the system library path.
+
+Once you've got your Android standalone toolchain built and added to your path you can do the following:
+
+To cross compile a NEON example:
+
+ #32 bit:
+ arm-linux-androideabi-clang++ examples/neon_convolution.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o neon_convolution_arm -static-libstdc++ -pie
+ #64 bit:
+ aarch64-linux-android-g++ examples/neon_convolution.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o neon_convolution_aarch64 -static-libstdc++ -pie
+
+To cross compile an OpenCL example:
+
+ #32 bit:
+ arm-linux-androideabi-clang++ examples/cl_convolution.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o cl_convolution_arm -static-libstdc++ -pie -lOpenCL
+ #64 bit:
+ aarch64-linux-android-g++ examples/cl_convolution.cpp -I. -Iinclude -std=c++11 -larm_compute-static -L. -o cl_convolution_aarch64 -static-libstdc++ -pie -lOpenCL
+
+@note Due to some issues in older versions of the Mali OpenCL DDK (<= r13p0), we recommend linking arm_compute statically on Android.
+
+Then all you need to do is upload the executable and the shared library to the device using ADB:
+
+ adb push neon_convolution_arm /data/local/tmp/
+ adb push cl_convolution_arm /data/local/tmp/
+ adb shell chmod 777 -R /data/local/tmp/
+
+And finally to run the example:
+
+ adb shell /data/local/tmp/neon_convolution_arm
+ adb shell /data/local/tmp/cl_convolution_arm
+
+For 64bit:
+
+ adb push neon_convolution_aarch64 /data/local/tmp/
+ adb push cl_convolution_aarch64 /data/local/tmp/
+ adb shell chmod 777 -R /data/local/tmp/
+
+And finally to run the example:
+
+ adb shell /data/local/tmp/neon_convolution_aarch64
+ adb shell /data/local/tmp/cl_convolution_aarch64
+
+@subsection S3_4_cl_stub_library The OpenCL stub library
+
+In the opencl-1.2-stubs folder you will find the sources to build a stub OpenCL library which can then be used to link your application or arm_compute against.
+
+If you prefer, you can retrieve the OpenCL library from your device and link against that instead, but this library will often have dependencies on a range of system libraries, forcing you to link your application against those too even though it is not using them.
+
+@warning The OpenCL library provided is a stub and *not* a real implementation. You can use it to resolve OpenCL's symbols in arm_compute while building the example, but you must make sure the real libOpenCL.so is in your library path when running the example or it will not work.
+
+To cross-compile the stub OpenCL library simply run:
+
+ <target-prefix>-gcc -o libOpenCL.so -Iinclude opencl-1.2-stubs/opencl_stubs.c -fPIC -shared
+
+For example:
+
+ #Linux 32bit
+ arm-linux-gnueabihf-gcc -o libOpenCL.so -Iinclude opencl-1.2-stubs/opencl_stubs.c -fPIC -shared
+ #Linux 64bit
+ aarch64-linux-gnu-gcc -o libOpenCL.so -Iinclude opencl-1.2-stubs/opencl_stubs.c -fPIC -shared
+ #Android 32bit
+ arm-linux-androideabi-clang -o libOpenCL.so -Iinclude opencl-1.2-stubs/opencl_stubs.c -fPIC -shared
+ #Android 64bit
+ aarch64-linux-android-gcc -o libOpenCL.so -Iinclude opencl-1.2-stubs/opencl_stubs.c -fPIC -shared
+*/
diff --git a/docs/01_library.dox b/docs/01_library.dox
new file mode 100644
index 0000000000..738579e7c6
--- /dev/null
+++ b/docs/01_library.dox
@@ -0,0 +1,250 @@
+namespace arm_compute
+{
+/**
+@page architecture Library architecture
+
+@tableofcontents
+
+@section S4_1 Core vs Runtime libraries
+
+The Core library is a low-level collection of algorithm implementations. It is designed to be embedded in existing projects and applications:
+
+- It doesn't allocate any memory (All the memory allocations/mappings have to be handled by the caller).
+- It doesn't perform any kind of multi-threading (but provides information to the caller about how the workload can be split).
+
+The Runtime library is a very basic wrapper around the Core library which can be used for quick prototyping. It is basic in the sense that:
+
+- It allocates images and tensors by using standard malloc().
+- It multi-threads NEON code in a basic way using a simple pool of threads.
+- For OpenCL it uses the default CLScheduler command queue for all mapping operations and kernels.
+
+For maximum performance, it is expected that users will re-implement an equivalent of the runtime library which better suits their needs (with a smarter multi-threading strategy, load balancing between NEON and OpenCL, etc.).
+
+@section S4_2_windows_kernels_mt_functions Windows, kernels, multi-threading and functions
+
+@subsection S4_2_1_windows Windows
+
+A @ref Window represents a workload to execute; it can handle up to @ref Coordinates::num_max_dimensions dimensions.
+Each dimension is defined by a start, end and step.
+
+It can be split into subwindows, as long as *all* the following rules remain true for all the dimensions (see the sketch below the list):
+
+- max[n].start() <= sub[n].start() < max[n].end()
+- sub[n].start() < sub[n].end() <= max[n].end()
+- max[n].step() == sub[n].step()
+- (sub[n].start() - max[n].start()) % max[n].step() == 0
+- (sub[n].end() - sub[n].start()) % max[n].step() == 0
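+
+For illustration, here is a minimal sketch of such a split using @ref Window::split_window() (which produces subwindows satisfying these rules); the dimensions used are made up:
+
+@code{.cpp}
+Window max_window;
+max_window.set(Window::DimX, Window::Dimension(0, 128, 8)); // start=0, end=128, step=8
+max_window.set(Window::DimY, Window::Dimension(0, 128, 1));
+
+// Split the window into 4 subwindows along Y; each keeps the parent's steps:
+for(size_t id = 0; id < 4; ++id)
+{
+    Window sub = max_window.split_window(Window::DimY, id, 4);
+    // sub covers the rows [id * 32, (id + 1) * 32) and satisfies all the rules above.
+}
+@endcode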
+
+@subsection S4_2_2 Kernels
+
+Each implementation of the @ref IKernel interface (base class of all the kernels in the core library) works in the same way:
+
+OpenCL kernels:
+
+@code{.cpp}
+// Initialize the CLScheduler with the default context and default command queue
+// Implicitly initializes the CLKernelLibrary to use ./cl_kernels as location for OpenCL kernels files and sets a default device for which OpenCL programs are built.
+CLScheduler::get().default_init();
+
+cl::CommandQueue q = CLScheduler::get().queue();
+//Create a kernel object:
+MyKernel kernel;
+// Initialize the kernel with the input/output and options you want to use:
+kernel.configure( input, output, option0, option1);
+// Retrieve the execution window of the kernel:
+const Window& max_window = kernel.window();
+// Run the whole kernel in the current thread:
+kernel.run( q, max_window ); // Enqueue the kernel to process the full window on the default queue
+
+// Wait for the processing to complete:
+q.finish();
+@endcode
+
+NEON / CPP kernels:
+
+@code{.cpp}
+//Create a kernel object:
+MyKernel kernel;
+// Initialize the kernel with the input/output and options you want to use:
+kernel.configure( input, output, option0, option1);
+// Retrieve the execution window of the kernel:
+const Window& max_window = kernel.window();
+// Run the whole kernel in the current thread:
+kernel.run( max_window ); // Run the kernel on the full window
+@endcode
+
+@subsection S4_2_3 Multi-threading
+
+The previous section shows how to run a NEON / CPP kernel in the current thread. However, if your system has several CPU cores, you will probably want the kernel to use several cores. Here is how this can be done:
+
+@snippet src/runtime/CPP/CPPScheduler.cpp Scheduler example
+
+This is the very basic implementation used in the NEON runtime library by all the NEON functions.
+
+@sa CPPScheduler.
+
+@note Some kernels, like for example @ref NEHistogramKernel, need a local temporary buffer to perform their calculations. In order to avoid memory corruption between threads, the local buffer must be of size: ```memory_needed_per_thread * num_threads``` and each subwindow must be initialized by calling @ref Window::set_thread_id() with a unique thread_id between 0 and num_threads - 1.
+
+@subsection S4_2_4 Functions
+
+Functions will automatically allocate the temporary buffers mentioned above, and will automatically multi-thread kernel executions using the very basic scheduler described in the previous section.
+
+Simple functions only call a single kernel (e.g. @ref NEConvolution3x3), while more complex ones consist of several kernels pipelined together (e.g. @ref NEGaussianPyramid, @ref NEHarrisCorners). Check their documentation to find out which kernels are used by each function.
+
+@code{.cpp}
+//Create a function object:
+MyFunction function;
+// Initialize the function with the input/output and options you want to use:
+function.configure( input, output, option0, option1);
+// Execute the function:
+function.run();
+@endcode
+
+@warning The Compute Library requires Mali OpenCL DDK r8p0 or higher (OpenCL kernels are compiled using the -cl-arm-non-uniform-work-group-size flag)
+
+@note All OpenCL functions and objects in the runtime library use the command queue associated with CLScheduler for all operations; a real implementation would be expected to use different queues for mapping operations and kernels in order to reach better GPU utilisation.
+
+@subsection S4_4_1_cl_scheduler OpenCL Scheduler and kernel library
+
+The Compute Library runtime uses a single command queue and context for all the operations.
+
+The user can get / set this context and command queue, as well as the target GPU device, through the CLScheduler's interface.
+
+@attention Make sure the application is using the same context as the library, as in OpenCL it is forbidden to share objects across contexts. This is done by calling @ref CLScheduler::init() or @ref CLScheduler::default_init() at the beginning of your application.
+
+@attention Make sure the scheduler's target is not changed after function classes are created.
+
+All OpenCL kernels used by the library are built and stored in @ref CLKernelLibrary.
+If the library is compiled with embed_kernels=0 the application can set the path to the OpenCL kernels by calling @ref CLKernelLibrary::init(); by default the path is set to "./cl_kernels".
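+
+For example, an application keeping the ".cl" files in a custom location could do something along these lines (a sketch, assuming the library was built with embed_kernels=0; the path is hypothetical):
+
+@code{.cpp}
+// Set up the default context and command queue first:
+CLScheduler::get().default_init();
+// Then point the kernel library at the folder containing the ".cl" files:
+CLKernelLibrary::get().init("/path/to/cl_kernels/");
+@endcode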
+
+@subsection S4_4_2_events_sync OpenCL events and synchronization
+
+In order to block until all the jobs in the CLScheduler's command queue are done executing, the user can call @ref CLScheduler::sync() or create a sync event using @ref CLScheduler::enqueue_sync_event().
+
+For example:
+@snippet cl_events.cpp OpenCL events
+
+@subsection S4_4_2_cl_neon OpenCL / NEON interoperability
+
+You can mix OpenCL and NEON kernels and functions. However, it is the user's responsibility to handle the mapping/unmapping of OpenCL objects, for example:
+
+@snippet neoncl_scale_median_gaussian.cpp NEON / OpenCL Interop
+
+@sa main_neoncl_scale_median_gaussian
+
+@section S4_5_algorithms Algorithms
+
+All algorithms in this library have been implemented following the [OpenVX 1.1 specifications](https://www.khronos.org/registry/vx/specs/1.1/html/). Please refer to the Khronos documentation for more information.
+
+@section S4_6_images_tensors Images, padding, border modes and tensors
+
+Most kernels and functions in the library process images; however, in order to be future-proof, most of the kernels actually accept tensors. See below for more information about how they are related.
+
+@attention Each memory object can be written by only one kernel, however it can be read by several kernels. Writing to the same object from several kernels will result in undefined behavior. The kernel writing to an object must be configured before the kernel(s) reading from it.
+
+@subsection S4_6_1_padding_and_border Padding and border modes
+
+Several algorithms require a neighborhood around the current pixel to compute its value. This means the algorithm will not be able to process the borders of the image unless you give it more information about how those border pixels should be processed. The @ref BorderMode enum is used for this purpose.
+
+You have 3 types of @ref BorderMode :
+
+- @ref BorderMode::UNDEFINED : Neighbor pixels outside of the image are treated as undefined. As a result, all the pixels on the border will have an undefined value.
+- @ref BorderMode::REPLICATE : Neighbor pixels outside of the image are treated as having the same value as the closest valid pixel.
+- @ref BorderMode::CONSTANT : Neighbor pixels outside of the image are treated as having the same constant value. (The user can choose what this value should be).
+
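+The border mode (and, for @ref BorderMode::CONSTANT, the value to use) is typically passed when configuring a function. A minimal sketch, assuming src and dst have already been initialised and allocated:
+
+@code{.cpp}
+NEGaussian3x3 gauss;
+// Treat neighbor pixels outside the image as having the constant value 0:
+gauss.configure(&src, &dst, BorderMode::CONSTANT, 0);
+gauss.run();
+@endcode
+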
+Moreover, both OpenCL and NEON use vector load and store instructions to access the data in buffers, so in order to avoid having special cases to handle for the borders, all the images and tensors used in this library must be padded.
+
+@subsubsection padding Padding
+
+There are different ways padding can be calculated:
+
+- Accurate padding:
+
+@snippet neon_convolution.cpp Accurate padding
+
+@note It's important to call allocate @b after the function is configured: if the image / tensor is already allocated then the function will shrink its execution window instead of increasing the padding. (See below for more details).
+
+- Manual padding / no padding / auto padding: You can allocate your images / tensors up front (before configuring your functions). In that case the function will use whatever padding is available and will shrink its execution window if there isn't enough padding available (which translates into a smaller valid region for the output). See also @ref valid_region.
+If you don't want to manually set the padding but still want to allocate your objects upfront then you can use auto_padding. It guarantees that the allocation will have enough padding to run any of the provided functions.
+
+@code{.cpp}
+Image src, dst;
+
+// Use auto padding for the input:
+src.info()->init_auto_padding(TensorShape(640u,480u), Format::U8);
+
+// Use manual padding for the destination image
+dst.info()->init(src.info()->tensor_shape(), Format::U8, strides_in_bytes, offset_first_element_in_bytes, total_size_in_bytes);
+
+// Allocate all the images
+src.allocator()->allocate();
+dst.allocator()->allocate();
+// Fill the input image with the content of the PPM image if a filename was provided:
+fill_image(src);
+
+NEGaussian3x3 gauss;
+
+// Apply a Gaussian 3x3 filter to the source image (Note: if the padding provided is not enough then the execution window and valid region of the output will be shrunk)
+gauss.configure(&src, &dst, BorderMode::UNDEFINED);
+
+//Execute the functions:
+gauss.run();
+@endcode
+
+@warning Some kernels need up to 3 neighbor values to calculate the value of a given pixel. Therefore, to be safe, we use a 4-pixel padding all around the image. In addition, some kernels read and write up to 32 pixels at the same time. To cover that case as well we add an extra 32 pixels of padding at the end of each row. As a result auto-padded buffers waste a lot of memory and are less cache-friendly. It is therefore recommended to use accurate padding or manual padding wherever possible.
+
+@subsubsection valid_region Valid regions
+
+Some kernels (like edge detectors for example) need to read values of neighboring pixels to calculate the value of a given pixel; it is therefore not possible to calculate the values of the pixels on the edges.
+
+Another case is when a kernel processes 8 pixels per iteration and the image's dimensions are not a multiple of 8 and not enough padding is available: the kernel will not be able to process the pixels near the right edge, and as a result these pixels will be left undefined.
+
+In order to know which pixels have been calculated, each kernel sets a valid region for each output image or tensor. See also @ref TensorInfo::valid_region(), @ref ValidRegion.
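+
+For example, after a function has run you can query which part of the output was actually computed (a sketch; assumes dst is an output image or tensor):
+
+@code{.cpp}
+ValidRegion valid = dst.info()->valid_region();
+// valid.anchor contains the coordinates of the first valid pixel,
+// valid.shape the dimensions of the valid area:
+int    first_valid_x = valid.anchor[0];
+size_t valid_width   = valid.shape[0];
+@endcode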
+
+@subsection S4_6_2_tensors Tensors
+
+Tensors are multi-dimensional arrays with a maximum of @ref Coordinates::num_max_dimensions dimensions.
+
+Depending on the number of dimensions, tensors can be interpreted as various objects. A scalar can be represented as a zero-dimensional tensor and a vector of numbers can be represented as a one-dimensional tensor. Further, an image is actually just a 2D tensor, a 3D tensor can be seen as an array of images and a 4D tensor as a 2D array of images, etc.
+
+@note Most algorithms process images (i.e a 2D slice of the tensor), therefore only padding along the X and Y axes is required (2D slices can be stored contiguously in memory).
+
+@subsection S4_6_3_description_conventions Images and Tensors description conventions
+
+Image objects are defined by a @ref Format and dimensions expressed as [width, height, batch].
+
+Tensors are defined by a @ref DataType plus a number of channels (Always expected to be 1 for now) and their dimensions are expressed as [width, height, feature_maps, batch].
+
+In other words, the lower three dimensions of a tensor specify a single input in [width, height, feature_maps], while any other specified dimension represents a batch in the appropriate dimension space.
+For example, a tensor with dimensions [128, 128, 64, 16] represents a 1D batch space with 16 batches of 128 elements in width and height and 64 feature maps each.
+Each kernel specifies the expected layout of each of its tensors in its documentation.
+
+@note Unless specified otherwise in the kernel's or function's documentation all tensors and images parameters passed must have identical dimensions.
+
+@note Unless specified otherwise in the kernel's or function's documentation the number of channels for tensors is expected to be 1 (For images, the number of channels is inferred from the @ref Format).
+
+@attention Regardless of the @ref DataType used by a tensor the @ref ITensor::buffer() method will always return a uint8_t pointer, and all the metadata in @ref TensorInfo will be expressed in bytes. It is the user's responsibility to cast the pointer to the correct type.
+
+For example, to read the element located at the coordinates (x,y) of a float tensor:
+
+@code{.cpp}
+float value = *reinterpret_cast<float*>(input.buffer() + input.info()->offset_element_in_bytes(Coordinates(x,y)));
+@endcode
+
+@subsection S4_6_4_working_with_objects Working with Images and Tensors using iterators
+
+The library provides some iterators to access objects' data.
+Iterators are created by associating a data object (an image or a tensor, for example) with an iteration window.
+
+Iteration windows are defined by an array of dimensions, each of which consists of a start, end and step.
+
+The @ref execute_window_loop function takes an execution window, a lambda function and one or more iterators.
+It will iterate through every element of the execution window and for each element it will update the iterators accordingly and call the lambda function.
+
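+As a minimal illustration, this is roughly what such a loop looks like (a sketch, assuming a float tensor that has already been allocated):
+
+@code{.cpp}
+Window window;
+window.use_tensor_dimensions(tensor.info()); // Iterate over the whole tensor
+
+Iterator it(&tensor, window);
+execute_window_loop(window, [&](const Coordinates & id)
+{
+    // it.ptr() points to the current element, id contains its coordinates:
+    *reinterpret_cast<float *>(it.ptr()) = 1.0f;
+},
+it);
+@endcode
+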
+Here are a couple of examples of how to use the iterators to fill / read tensors:
+
+@snippet examples/neon_copy_objects.cpp Copy objects example
+*/
+} // namespace arm_compute
diff --git a/docs/02_tests.dox b/docs/02_tests.dox
new file mode 100644
index 0000000000..fd5bc59194
--- /dev/null
+++ b/docs/02_tests.dox
@@ -0,0 +1,93 @@
+/**
+@page tests Test architecture
+
+@tableofcontents
+
+@section building_test_dependencies Building dependencies
+
+The tests currently make use of Boost (Test and Program options) for validation
+and Google Benchmark for performance runs. Below are instructions on how to
+build these 3rd party libraries.
+
+@subsection building_boost Building Boost
+
+First follow the instructions from the Boost library on how to setup the Boost
+build system
+(http://www.boost.org/doc/libs/1_64_0/more/getting_started/index.html).
+Afterwards the required libraries can be built with:
+
+ ./b2 --with-program_options --with-test link=static \
+ define=BOOST_TEST_ALTERNATIVE_INIT_API
+
+Additionally, depending on your environment, it might be necessary to specify
+the ```toolset=``` option to choose the right compiler. Moreover,
+```address-model=32``` can be used to force building for 32bit and
+```target-os=android``` must be specified to build for Android.
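+
+For example, a 32bit Android build of the libraries could look something like this (the toolset name depends on your Boost user-config.jam):
+
+ ./b2 --with-program_options --with-test link=static \
+ define=BOOST_TEST_ALTERNATIVE_INIT_API \
+ toolset=clang address-model=32 target-os=android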
+
+After executing the build command the libraries
+```libboost_program_options.a``` and ```libboost_unit_test_framework.a``` can
+be found in ```./stage/lib```.
+
+@subsection building_google_benchmark Building Google Benchmark
+
+Instructions on how to build Google Benchmark using CMake can be found in their
+repository: https://github.com/google/benchmark. For example, building for
+Android 32bit can be achieved via
+
+ cmake -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_CXX_COMPILER=arm-linux-androideabi-clang++ \
+ -DBENCHMARK_ENABLE_LTO=false -DBENCHMARK_ENABLE_TESTING=false ..
+
+The library required by the Compute Library is ```libbenchmark.a```.
+
+@section tests_running_tests Running tests
+@subsection tests_running_tests_benchmarking Benchmarking
+@subsubsection tests_running_tests_benchmarking_filter Filter tests
+All tests can be run by invoking
+
+ ./arm_compute_benchmark -- ./data
+
+where `./data` contains the assets needed by the tests.
+
+If only a subset of the tests has to be executed the `--benchmark_filter` option takes a regular expression to select matching tests.
+
+ ./arm_compute_benchmark --benchmark_filter=neon_bitwise_and ./data
+
+All available tests can be displayed with the `--benchmark_list_tests` switch.
+
+ ./arm_compute_benchmark --benchmark_list_tests ./data
+
+@subsubsection tests_running_tests_benchmarking_runtime Runtime
+By default every test is run for multiple *iterations* until a minimum time is reached. The minimum time (in seconds) can be controlled with the `--benchmark_min_time` flag. However, each test might have a hard-coded value for the number of iterations or minimum execution time. In that case the command line argument is ignored for those specific tests.
+Additionally it is possible to specify multiple *repetitions* (`--benchmark_repetitions`) which will run each test multiple times (including the iterations). The average and standard deviation over all repetitions are automatically computed and reported.
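+
+For example, to run the matching tests for at least two seconds each and repeat the whole measurement five times (the values are illustrative):
+
+ ./arm_compute_benchmark --benchmark_filter=neon_bitwise_and --benchmark_min_time=2 --benchmark_repetitions=5 ./data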
+
+@subsubsection tests_running_tests_benchmarking_verbosity Verbosity
+The verbosity of the test output can be controlled via the `--v` flag, though it should hardly ever be necessary.
+
+@subsection tests_running_tests_validation Validation
+@subsubsection tests_running_tests_validation_filter Filter tests
+All tests can be run by invoking
+
+ ./arm_compute_validation -- ./data
+
+where `./data` contains the assets needed by the tests.
+
+As running all tests can take a lot of time, the suite is split into "precommit" and "nightly" tests. The precommit tests will be fast to execute but still cover the most important features. In contrast, the nightly tests offer more extensive coverage but take longer. The different subsets can be selected from the command line as follows:
+
+ ./arm_compute_validation -t @precommit -- ./data
+ ./arm_compute_validation -t @nightly -- ./data
+
+Additionally it is possible to select specific suites or tests:
+
+ ./arm_compute_validation -t CL -- ./data
+ ./arm_compute_validation -t NEON/BitwiseAnd/RunSmall/_0 -- ./data
+
+All available tests can be displayed with the `--list_content` switch.
+
+ ./arm_compute_validation --list_content -- ./data
+
+For a complete list of possible selectors please see: http://www.boost.org/doc/libs/1_64_0/libs/test/doc/html/boost_test/runtime_config/test_unit_filtering.html
+
+@subsubsection tests_running_tests_validation_verbosity Verbosity
+There are two separate flags to control the verbosity of the test output. `--report_level` controls the verbosity of the summary produced after all tests have been executed. `--log_level` controls the verbosity of the information generated during the execution of tests. All available settings can be found in the Boost documentation for [--report_level](http://www.boost.org/doc/libs/1_64_0/libs/test/doc/html/boost_test/utf_reference/rt_param_reference/report_level.html) and [--log_level](http://www.boost.org/doc/libs/1_64_0/libs/test/doc/html/boost_test/utf_reference/rt_param_reference/log_level.html), respectively.
+*/
diff --git a/docs/Doxyfile b/docs/Doxyfile
new file mode 100644
index 0000000000..e70766b916
--- /dev/null
+++ b/docs/Doxyfile
@@ -0,0 +1,2458 @@
+# Doxyfile 1.8.9.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "Compute Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER = 0.0
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = build/docs/
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+#ALLOW_UNICODE_NAMES = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF = "The $name class" \
+ "The $name widget" \
+ "The $name file" \
+ is \
+ provides \
+ specifies \
+ contains \
+ represents \
+ a \
+ an \
+ the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+#STRIP_FROM_PATH = arm_compute/
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the later case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING = cl=C
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When the TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = NO
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = YES
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous
+# namespaces are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = YES
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+#HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+#SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with angle brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = YES
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
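+# Example (hypothetical; this project defines no conditional sections): to
+# include blocks wrapped in \if internal_docs ... \endif, one would set:
+# ENABLED_SECTIONS = internal_docs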
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output is used
+# as the file version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
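+# Example (hypothetical; no filter is configured here): report the last git
+# commit touching each file as its version; doxygen appends the file name to
+# the command:
+# FILE_VERSION_FILTER = "git log -n 1 --pretty=format:%h --"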
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES =
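+# Example (hypothetical): if reference definitions lived in docs/references.bib
+# one could set (the .bib extension may be omitted):
+# CITE_BIB_FILES = docs/references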
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = YES
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line:[DOXY_WARN] $text"
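+# Note: the non-default [DOXY_WARN] marker above presumably makes warnings easy
+# to grep out of captured build output, e.g. (hypothetical):
+# doxygen <configfile> 2>&1 | grep DOXY_WARN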
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT = ./docs/00_introduction.dox \
+ ./docs/01_library.dox \
+ ./docs/02_tests.dox \
+ ./arm_compute/ \
+ ./src/core/CL/cl_kernels/ \
+ ./examples/ \
+ ./tests/ \
+ ./utils/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS = *.c \
+ *.cc \
+ *.cxx \
+ *.cpp \
+ *.c++ \
+ *.java \
+ *.ii \
+ *.ixx \
+ *.ipp \
+ *.i++ \
+ *.inl \
+ *.idl \
+ *.ddl \
+ *.odl \
+ *.h \
+ *.hh \
+ *.hxx \
+ *.hpp \
+ *.h++ \
+ *.cs \
+ *.d \
+ *.php \
+ *.php4 \
+ *.php5 \
+ *.phtml \
+ *.inc \
+ *.m \
+ *.markdown \
+ *.md \
+ *.mm \
+ *.dox \
+ *.py \
+ *.f90 \
+ *.f \
+ *.for \
+ *.tcl \
+ *.vhd \
+ *.vhdl \
+ *.ucf \
+ *.qsf \
+ *.as \
+ *.js \
+ *.cl
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
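+# Example (hypothetical; nothing is excluded here): to skip any build output
+# directories one could set:
+# EXCLUDE_PATTERNS = */build/*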
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH = ./examples/ \
+ . \
+ ./arm_compute/
+
+# "." is Needed by the release script
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH = ./docs/
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
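+# Example (hypothetical; no filter is used here): pipe each file through sed to
+# strip trailing carriage returns before parsing (this keeps the line count
+# unchanged, as required):
+# INPUT_FILTER = "sed -e 's/\r$//'"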
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
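+# Example (hypothetical): if a README.md were part of the INPUT, it could be
+# promoted to the main page with:
+# USE_MDFILE_AS_MAINPAGE = README.md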
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = YES
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# compiled with the --with-libclang option.
+# The default value is: NO.
+
+#CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+#CLANG_OPTIONS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
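+# Example (hypothetical; not used here): many class names in this library share
+# the CL prefix, so the index headers could ignore that prefix with:
+# IGNORE_PREFIX = CL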
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .xhtml
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# For valid HTML the header file must include any scripts and style sheets that
+# doxygen needs, which depend on the configuration options used (e.g. the
+# setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER = ./docs/header.html
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated; together with the HTML files they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files need
+# to be copied into the plugins directory of Eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying, Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes take effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = YES
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow; in that
+# case enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of a
+# project to a relative location where the documentation can be found. The
+# format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. To get the times font for
+# instance you can specify
+# EXTRA_PACKAGES=times
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string; for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (i.e. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+#LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements;
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
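+#
+# A hypothetical example (the file name is a placeholder): generate a template
+# with "doxygen -e rtf rtf_extensions.cfg" and then point this tag at it:
+# RTF_EXTENSIONS_FILE = rtf_extensions.cfg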
+
+RTF_EXTENSIONS_FILE =
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+#RTF_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+#MAN_SUBDIR =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+#DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sf.net) file that captures the
+# structure of the code including all documentation. Note that this feature is
+# still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED = DOXYGEN_SKIP_THIS \
+ protected=private \
+ GRAD_X \
+ GRAD_Y \
+ MAGNITUDE=1 \
+ PHASE=1 \
+ STDDEV \
+ LOCATE_MIN \
+ LOCATE_MAX \
+ HAS_BIAS \
+ POOL_AVG \
+ ARM_COMPUTE_ENABLE_FP16
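+
+# (The macros above keep conditionally compiled code, e.g. the
+# ARM_COMPUTE_ENABLE_FP16 paths, visible to the parser; protected=private
+# additionally makes doxygen treat protected members as private, hiding them
+# from the generated documentation.)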
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
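+#
+# A hypothetical example (the tag file and URL are placeholders):
+# TAGFILES = external_lib.tag=http://docs.example.com/external_lib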
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+#DIA_PATH =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO.
+# The default value is: YES.
+
+HAVE_DOT = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS = 0
+
+# When you want a different font in the dot files that doxygen generates,
+# you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH = NO
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS = YES
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH = NO
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH = NO
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will show a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot.
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd and svg.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT = svg
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+#DIAFILE_DIRS =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+#PLANTUML_JAR_PATH =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+#PLANTUML_INCLUDE_PATH =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES = 500
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP = YES
diff --git a/docs/header.html b/docs/header.html
new file mode 100644
index 0000000000..1a4572135c
--- /dev/null
+++ b/docs/header.html
@@ -0,0 +1,56 @@
+<!-- HTML header for doxygen 1.8.9.1 -->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen $doxygenversion"/>
+<meta name="robots" content="NOINDEX, NOFOLLOW" /> <!-- Prevent indexing by search engines -->
+<!--BEGIN PROJECT_NAME--><title>$projectname: $title</title><!--END PROJECT_NAME-->
+<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
+<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="$relpath^jquery.js"></script>
+<script type="text/javascript" src="$relpath^dynsections.js"></script>
+$treeview
+$search
+$mathjax
+<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
+$extrastylesheet
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+
+<!--BEGIN TITLEAREA-->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+ <!--BEGIN PROJECT_LOGO-->
+ <td id="projectlogo"><img alt="Logo" src="$relpath^$projectlogo"/></td>
+ <!--END PROJECT_LOGO-->
+ <!--BEGIN PROJECT_NAME-->
+ <td style="padding-left: 0.5em;">
+ <div id="projectname">$projectname
+ <!--BEGIN PROJECT_NUMBER-->&#160;<span id="projectnumber">$projectnumber</span><!--END PROJECT_NUMBER-->
+ </div>
+ <!--BEGIN PROJECT_BRIEF--><div id="projectbrief">$projectbrief</div><!--END PROJECT_BRIEF-->
+ </td>
+ <!--END PROJECT_NAME-->
+ <!--BEGIN !PROJECT_NAME-->
+ <!--BEGIN PROJECT_BRIEF-->
+ <td style="padding-left: 0.5em;">
+ <div id="projectbrief">$projectbrief</div>
+ </td>
+ <!--END PROJECT_BRIEF-->
+ <!--END !PROJECT_NAME-->
+ <!--BEGIN DISABLE_INDEX-->
+ <!--BEGIN SEARCHENGINE-->
+ <td>$searchbox</td>
+ <!--END SEARCHENGINE-->
+ <!--END DISABLE_INDEX-->
+ </tr>
+ </tbody>
+</table>
+</div>
+<!--END TITLEAREA-->
+<!-- end header part -->
diff --git a/examples/SConscript b/examples/SConscript
new file mode 100644
index 0000000000..748f771ec7
--- /dev/null
+++ b/examples/SConscript
@@ -0,0 +1,70 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import SCons
+import os.path
+
+Import('env')
+Import('arm_compute_a')
+Import('arm_compute_so')
+
+if env['opencl']:
+ Import('opencl')
+
+examples_env = env.Clone()
+
+examples_env.Append(CPPPATH = ["#"])
+examples_env.Append(LIBPATH = ["#build/%s" % env['build_dir']])
+examples_env.Append(LIBPATH = ["#build/%s/opencl-1.2-stubs" % env['build_dir']])
+
+# Build examples
+utils = examples_env.Object("../utils/Utils.cpp")
+
+if env['os'] in ['android', 'bare_metal']:
+ arm_compute_lib = arm_compute_a
+ arm_compute_dependency = arm_compute_a
+else:
+ arm_compute_lib = "arm_compute"
+ arm_compute_dependency = arm_compute_so
+
+if env['opencl'] and env['neon']:
+ for file in Glob("./neoncl_*.cpp"):
+ example = os.path.basename(os.path.splitext(str(file))[0])
+ prog = examples_env.Program(example, ["{}.cpp".format(example), utils], LIBS = [arm_compute_lib, "OpenCL"])
+ Depends(prog, [arm_compute_dependency, opencl])
+ alias = examples_env.Alias(example, prog)
+ Default(alias)
+
+if env['opencl']:
+ for file in Glob("./cl_*.cpp"):
+ example = os.path.basename(os.path.splitext(str(file))[0])
+ prog = examples_env.Program(example, ["{}.cpp".format(example), utils], LIBS = [arm_compute_lib, "OpenCL"])
+ Depends(prog, [arm_compute_dependency, opencl])
+ alias = examples_env.Alias(example, prog)
+ Default(alias)
+
+if env['neon']:
+ for file in Glob("./neon_*.cpp"):
+ example = os.path.basename(os.path.splitext(str(file))[0])
+ prog = examples_env.Program(example, ["{}.cpp".format(example), utils], LIBS = [arm_compute_lib])
+ Depends(prog, arm_compute_dependency)
+ alias = examples_env.Alias(example, prog)
+ Default(alias)
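+
+# Note: each example above is registered as a default build alias, so once the
+# build is configured a single binary can be requested by its alias name, e.g.
+# "scons cl_convolution" (a sketch: this assumes the relevant options, such as
+# opencl=1, were passed on the same command line).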
diff --git a/examples/cl_convolution.cpp b/examples/cl_convolution.cpp
new file mode 100644
index 0000000000..06f6f144e1
--- /dev/null
+++ b/examples/cl_convolution.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define ARM_COMPUTE_CL /* So that OpenCL exceptions get caught too */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "utils/Utils.h"
+
+using namespace arm_compute;
+using namespace utils;
+
+/** Gaussian 3x3 matrix
+ */
+const int16_t gaussian3x3[] =
+{
+ 1, 2, 1,
+ 2, 4, 2,
+ 1, 2, 1
+};
+
+/** Gaussian 5x5 matrix
+ */
+const int16_t gaussian5x5[] =
+{
+ 1, 4, 6, 4, 1,
+ 4, 16, 24, 16, 4,
+ 6, 24, 36, 24, 6,
+ 4, 16, 24, 16, 4,
+ 1, 4, 6, 4, 1
+};
+
+void main_cl_convolution(int argc, const char **argv)
+{
+ PPMLoader ppm;
+ CLImage src, tmp, dst;
+
+ CLScheduler::get().default_init();
+
+ if(argc < 2)
+ {
+ // Print help
+ std::cout << "Usage: ./build/cl_convolution [input_image.ppm]\n\n";
+ std::cout << "No input_image provided, creating a dummy 640x480 image\n";
+ // Create an empty grayscale 640x480 image
+ src.allocator()->init(TensorInfo(640, 480, Format::U8));
+ }
+ else
+ {
+ ppm.open(argv[1]);
+ ppm.init_image(src, Format::U8);
+ }
+
+ // Configure the temporary and destination images
+ tmp.allocator()->init(*src.info());
+ dst.allocator()->init(*src.info());
+
+ CLConvolution3x3 conv3x3;
+ CLConvolution5x5 conv5x5;
+
+ // Apply a Gaussian 3x3 filter to the source image followed by a Gaussian 5x5:
+ conv3x3.configure(&src, &tmp, gaussian3x3, 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
+ conv5x5.configure(&tmp, &dst, gaussian5x5, 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
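+ // (A note on the scale argument: with 0 the library derives the scale from
+ // the matrix itself, which for these Gaussian kernels should be the sum of
+ // the coefficients, i.e. 16 for the 3x3 and 256 for the 5x5.)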
+
+ // Allocate all the images
+ src.allocator()->allocate();
+ tmp.allocator()->allocate();
+ dst.allocator()->allocate();
+ // Fill the input image with the content of the PPM image if a filename was provided:
+ if(ppm.is_open())
+ {
+ ppm.fill_image(src);
+ }
+
+ // Execute the functions:
+ conv3x3.run();
+ conv5x5.run();
+
+ // Make sure all the OpenCL jobs are done executing:
+ CLScheduler::get().sync();
+
+ // Save the result to file:
+ if(ppm.is_open())
+ {
+ const std::string output_filename = std::string(argv[1]) + "_out.ppm";
+ save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM
+ }
+}
+
+/** Main program for convolution test
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Path to PPM image to process )
+ */
+int main(int argc, const char **argv)
+{
+ return utils::run_example(argc, argv, main_cl_convolution);
+}
diff --git a/examples/cl_events.cpp b/examples/cl_events.cpp
new file mode 100644
index 0000000000..768f620622
--- /dev/null
+++ b/examples/cl_events.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define ARM_COMPUTE_CL /* So that OpenCL exceptions get caught too */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "utils/Utils.h"
+
+using namespace arm_compute;
+using namespace utils;
+
+void main_cl_events(int argc, const char **argv)
+{
+ /** [OpenCL events] **/
+ PPMLoader ppm;
+ CLImage src, tmp_scale_median, tmp_median_gauss, dst;
+ constexpr int scale_factor = 2;
+
+ CLScheduler::get().default_init();
+
+ if(argc < 2)
+ {
+ // Print help
+ std::cout << "Usage: ./build/cl_events [input_image.ppm]\n\n";
+ std::cout << "No input_image provided, creating a dummy 640x480 image\n";
+ // Create an empty grayscale 640x480 image
+ src.allocator()->init(TensorInfo(640, 480, Format::U8));
+ }
+ else
+ {
+ ppm.open(argv[1]);
+ ppm.init_image(src, Format::U8);
+ }
+
+ // Declare and configure the functions to create the following pipeline: scale -> median -> gauss
+ CLScale scale;
+ CLMedian3x3 median;
+ CLGaussian5x5 gauss;
+
+ TensorInfo dst_info(src.info()->dimension(0) / scale_factor, src.info()->dimension(1) / scale_factor, Format::U8);
+
+ // Configure the temporary and destination images
+ dst.allocator()->init(dst_info);
+ tmp_scale_median.allocator()->init(dst_info);
+ tmp_median_gauss.allocator()->init(dst_info);
+
+ //Configure the functions:
+ scale.configure(&src, &tmp_scale_median, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE);
+ median.configure(&tmp_scale_median, &tmp_median_gauss, BorderMode::REPLICATE);
+ gauss.configure(&tmp_median_gauss, &dst, BorderMode::REPLICATE);
+
+ // Allocate all the images
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+ tmp_scale_median.allocator()->allocate();
+ tmp_median_gauss.allocator()->allocate();
+ // Fill the input image with the content of the PPM image if a filename was provided:
+ if(ppm.is_open())
+ {
+ ppm.fill_image(src);
+ }
+
+ // Enqueue and flush the scale OpenCL kernel:
+ scale.run();
+ // Create a synchronisation event between scale and median:
+ cl::Event scale_event = CLScheduler::get().enqueue_sync_event();
+ // Enqueue and flush the median OpenCL kernel:
+ median.run();
+ // Enqueue and flush the Gaussian OpenCL kernel:
+ gauss.run();
+
+ //Make sure all the OpenCL jobs are done executing:
+ scale_event.wait(); // Block until Scale is done executing (Median3x3 and Gaussian5x5 might still be running)
+ CLScheduler::get().sync(); // Block until Gaussian5x5 is done executing
+
+ // Save the result to file:
+ if(ppm.is_open())
+ {
+ const std::string output_filename = std::string(argv[1]) + "_out.ppm";
+ save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM
+ }
+ /** [OpenCL events] **/
+}
+
+/** Main program for events test
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Path to PPM image to process )
+ */
+int main(int argc, const char **argv)
+{
+ return utils::run_example(argc, argv, main_cl_events);
+}
diff --git a/examples/neon_cnn.cpp b/examples/neon_cnn.cpp
new file mode 100644
index 0000000000..952ae4d485
--- /dev/null
+++ b/examples/neon_cnn.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+
+#include "arm_compute/core/Types.h"
+#include "utils/Utils.h"
+
+using namespace arm_compute;
+using namespace utils;
+
+void main_cnn(int argc, const char **argv)
+{
+ ARM_COMPUTE_UNUSED(argc);
+ ARM_COMPUTE_UNUSED(argv);
+
+ // The src tensor should contain the input image
+ Tensor src;
+
+ // The weights and biases tensors should be initialized with the values inferred with the training
+ Tensor weights0;
+ Tensor weights1;
+ Tensor weights2;
+ Tensor biases0;
+ Tensor biases1;
+ Tensor biases2;
+
+ Tensor out_conv0;
+ Tensor out_conv1;
+ Tensor out_act0;
+ Tensor out_act1;
+ Tensor out_act2;
+ Tensor out_pool0;
+ Tensor out_pool1;
+ Tensor out_fc0;
+ Tensor out_softmax;
+
+ NEConvolutionLayer conv0;
+ NEConvolutionLayer conv1;
+ NEPoolingLayer pool0;
+ NEPoolingLayer pool1;
+ NEFullyConnectedLayer fc0;
+ NEActivationLayer act0;
+ NEActivationLayer act1;
+ NEActivationLayer act2;
+ NESoftmaxLayer softmax;
+
+ /* [Initialize tensors] */
+
+ // Initialize src tensor
+ constexpr unsigned int width_src_image = 32;
+ constexpr unsigned int height_src_image = 32;
+ constexpr unsigned int ifm_src_img = 1;
+
+ const TensorShape src_shape(width_src_image, height_src_image, ifm_src_img);
+ src.allocator()->init(TensorInfo(src_shape, 1, DataType::F32));
+
+ // Initialize tensors of conv0
+ constexpr unsigned int kernel_x_conv0 = 5;
+ constexpr unsigned int kernel_y_conv0 = 5;
+ constexpr unsigned int ofm_conv0 = 8;
+
+ const TensorShape weights_shape_conv0(kernel_x_conv0, kernel_y_conv0, src_shape.z(), ofm_conv0);
+ const TensorShape biases_shape_conv0(weights_shape_conv0[3]);
+ const TensorShape out_shape_conv0(src_shape.x(), src_shape.y(), weights_shape_conv0[3]);
+
+ weights0.allocator()->init(TensorInfo(weights_shape_conv0, 1, DataType::F32));
+ biases0.allocator()->init(TensorInfo(biases_shape_conv0, 1, DataType::F32));
+ out_conv0.allocator()->init(TensorInfo(out_shape_conv0, 1, DataType::F32));
+
+ // Initialize tensor of act0
+ out_act0.allocator()->init(TensorInfo(out_shape_conv0, 1, DataType::F32));
+
+ // Initialize tensor of pool0
+ TensorShape out_shape_pool0 = out_shape_conv0;
+ out_shape_pool0.set(0, out_shape_pool0.x() / 2);
+ out_shape_pool0.set(1, out_shape_pool0.y() / 2);
+ out_pool0.allocator()->init(TensorInfo(out_shape_pool0, 1, DataType::F32));
+
+ // Initialize tensors of conv1
+ constexpr unsigned int kernel_x_conv1 = 3;
+ constexpr unsigned int kernel_y_conv1 = 3;
+ constexpr unsigned int ofm_conv1 = 16;
+
+ const TensorShape weights_shape_conv1(kernel_x_conv1, kernel_y_conv1, out_shape_pool0.z(), ofm_conv1);
+
+ const TensorShape biases_shape_conv1(weights_shape_conv1[3]);
+ const TensorShape out_shape_conv1(out_shape_pool0.x(), out_shape_pool0.y(), weights_shape_conv1[3]);
+
+ weights1.allocator()->init(TensorInfo(weights_shape_conv1, 1, DataType::F32));
+ biases1.allocator()->init(TensorInfo(biases_shape_conv1, 1, DataType::F32));
+ out_conv1.allocator()->init(TensorInfo(out_shape_conv1, 1, DataType::F32));
+
+ // Initialize tensor of act1
+ out_act1.allocator()->init(TensorInfo(out_shape_conv1, 1, DataType::F32));
+
+ // Initialize tensor of pool1
+ TensorShape out_shape_pool1 = out_shape_conv1;
+ out_shape_pool1.set(0, out_shape_pool1.x() / 2);
+ out_shape_pool1.set(1, out_shape_pool1.y() / 2);
+ out_pool1.allocator()->init(TensorInfo(out_shape_pool1, 1, DataType::F32));
+
+ // Initialize tensor of fc0
+ constexpr unsigned int num_labels = 128;
+
+ const TensorShape weights_shape_fc0(out_shape_pool1.x() * out_shape_pool1.y() * out_shape_pool1.z(), num_labels);
+ const TensorShape biases_shape_fc0(num_labels);
+ const TensorShape out_shape_fc0(num_labels);
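+ // (The fully connected layer flattens the 8x8x16 pool1 output into
+ // 8 * 8 * 16 = 1024 inputs and maps them to the 128 labels.)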
+
+ weights2.allocator()->init(TensorInfo(weights_shape_fc0, 1, DataType::F32));
+ biases2.allocator()->init(TensorInfo(biases_shape_fc0, 1, DataType::F32));
+ out_fc0.allocator()->init(TensorInfo(out_shape_fc0, 1, DataType::F32));
+
+ // Initialize tensor of act2
+ out_act2.allocator()->init(TensorInfo(out_shape_fc0, 1, DataType::F32));
+
+ // Initialize tensor of softmax
+ const TensorShape out_shape_softmax(out_shape_fc0.x());
+ out_softmax.allocator()->init(TensorInfo(out_shape_softmax, 1, DataType::F32));
+
+ /* -----------------------End: [Initialize tensors] */
+
+ /* [Configure functions] */
+
+ // in:32x32x1: 5x5 convolution, 8 output feature maps (OFM)
+ conv0.configure(&src, &weights0, &biases0, &out_conv0, PadStrideInfo());
+
+ // in:32x32x8, out:32x32x8, Activation function: relu
+ act0.configure(&out_conv0, &out_act0, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+
+ // in:32x32x8, out:16x16x8 (2x2 pooling), Pooling function: Max
+ pool0.configure(&out_act0, &out_pool0, PoolingLayerInfo(PoolingType::MAX, 2));
+
+ // in:16x16x8: 3x3 convolution, 16 output feature maps (OFM)
+ conv1.configure(&out_pool0, &weights1, &biases1, &out_conv1, PadStrideInfo());
+
+ // in:16x16x16, out:16x16x16, Activation function: relu
+ act1.configure(&out_conv1, &out_act1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+
+ // in:16x16x16, out:8x8x16 (2x2 pooling), Pool type function: Average
+ pool1.configure(&out_act1, &out_pool1, PoolingLayerInfo(PoolingType::AVG, 2));
+
+ // in:8x8x16, out:128
+ fc0.configure(&out_pool1, &weights2, &biases2, &out_fc0);
+
+ // in:128, out:128, Activation function: relu
+ act2.configure(&out_fc0, &out_act2, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+
+ // in:128, out:128
+ softmax.configure(&out_act2, &out_softmax);
+
+ /* -----------------------End: [Configure functions] */
+
+ /* [Allocate tensors] */
+
+ // Now that the padding requirements are known we can allocate the images:
+ src.allocator()->allocate();
+ weights0.allocator()->allocate();
+ weights1.allocator()->allocate();
+ weights2.allocator()->allocate();
+ biases0.allocator()->allocate();
+ biases1.allocator()->allocate();
+ biases2.allocator()->allocate();
+ out_conv0.allocator()->allocate();
+ out_conv1.allocator()->allocate();
+ out_act0.allocator()->allocate();
+ out_act1.allocator()->allocate();
+ out_act2.allocator()->allocate();
+ out_pool0.allocator()->allocate();
+ out_pool1.allocator()->allocate();
+ out_fc0.allocator()->allocate();
+ out_softmax.allocator()->allocate();
+
+ /* -----------------------End: [Allocate tensors] */
+
+ /* [Initialize weights and biases tensors] */
+
+ // Once the tensors have been allocated, the src, weights and biases tensors can be initialized
+ // ...
+
+ /* -----------------------[Initialize weights and biases tensors] */
+
+ /* [Execute the functions] */
+
+ conv0.run();
+ act0.run();
+ pool0.run();
+ conv1.run();
+ act1.run();
+ pool1.run();
+ fc0.run();
+ act2.run();
+ softmax.run();
+
+ /* -----------------------End: [Execute the functions] */
+}
+
+/** Main program for cnn test
+ *
+ * The example implements the following CNN architecture:
+ *
+ * Input -> conv0:5x5 -> act0:relu -> pool:2x2 -> conv1:3x3 -> act1:relu -> pool:2x2 -> fc0 -> act2:relu -> softmax
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments
+ */
+int main(int argc, const char **argv)
+{
+ return utils::run_example(argc, argv, main_cnn);
+} \ No newline at end of file
diff --git a/examples/neon_convolution.cpp b/examples/neon_convolution.cpp
new file mode 100644
index 0000000000..222c8f9a37
--- /dev/null
+++ b/examples/neon_convolution.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+
+#include "arm_compute/core/Types.h"
+#include "utils/Utils.h"
+
+using namespace arm_compute;
+using namespace utils;
+
+/** Gaussian 3x3 matrix
+ */
+const int16_t gaussian3x3[] =
+{
+ 1, 2, 1,
+ 2, 4, 2,
+ 1, 2, 1
+};
+
+/** Gaussian 5x5 matrix
+ */
+const int16_t gaussian5x5[] =
+{
+ 1, 4, 6, 4, 1,
+ 4, 16, 24, 16, 4,
+ 6, 24, 36, 24, 6,
+ 4, 16, 24, 16, 4,
+ 1, 4, 6, 4, 1
+};
+
+void main_neon_convolution(int argc, const char **argv)
+{
+ /** [Accurate padding] **/
+ PPMLoader ppm;
+ Image src, tmp, dst;
+
+ if(argc < 2)
+ {
+ // Print help
+ std::cout << "Usage: ./build/neon_convolution [input_image.ppm]\n\n";
+ std::cout << "No input_image provided, creating a dummy 640x480 image\n";
+ // Initialize just the dimensions and format of your buffers:
+ src.allocator()->init(TensorInfo(640, 480, Format::U8));
+ }
+ else
+ {
+ ppm.open(argv[1]);
+ // Initialize just the dimensions and format of your buffers:
+ ppm.init_image(src, Format::U8);
+ }
+
+ // Initialize just the dimensions and format of the temporary and destination images:
+ tmp.allocator()->init(*src.info());
+ dst.allocator()->init(*src.info());
+
+ NEConvolution3x3 conv3x3;
+ NEConvolution5x5 conv5x5;
+
+ // Apply a Gaussian 3x3 filter to the source image followed by a Gaussian 5x5:
+ // The function will automatically update the padding information inside input and output to match its requirements
+ conv3x3.configure(&src, &tmp, gaussian3x3, 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
+ conv5x5.configure(&tmp, &dst, gaussian5x5, 0 /* Let arm_compute calculate the scale */, BorderMode::UNDEFINED);
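+
+ // As a side note (assuming the TensorInfo::padding() accessor), the padding the
+ // functions requested could be inspected at this point, for example:
+ // const PaddingSize src_padding = src.info()->padding();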
+
+ // Now that the padding requirements are known we can allocate the images:
+ src.allocator()->allocate();
+ tmp.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill the input image with the content of the PPM image if a filename was provided:
+ if(ppm.is_open())
+ {
+ ppm.fill_image(src);
+ }
+
+ // Execute the functions:
+ conv3x3.run();
+ conv5x5.run();
+
+ // Save the result to file:
+ if(ppm.is_open())
+ {
+ const std::string output_filename = std::string(argv[1]) + "_out.ppm";
+ save_to_ppm(dst, output_filename);
+ }
+ /** [Accurate padding] **/
+}
+
+/** Main program for convolution test
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Path to PPM image to process )
+ */
+int main(int argc, const char **argv)
+{
+ return utils::run_example(argc, argv, main_neon_convolution);
+}
diff --git a/examples/neon_copy_objects.cpp b/examples/neon_copy_objects.cpp
new file mode 100644
index 0000000000..191f455557
--- /dev/null
+++ b/examples/neon_copy_objects.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+
+#include "arm_compute/core/Types.h"
+#include "utils/Utils.h"
+
+#include <cstring>
+#include <iostream>
+
+using namespace arm_compute;
+
+void main_neon_copy_objects(int argc, const char **argv)
+{
+ ARM_COMPUTE_UNUSED(argc);
+ ARM_COMPUTE_UNUSED(argv);
+
+ /** [Copy objects example] */
+ constexpr unsigned int width = 4;
+ constexpr unsigned int height = 3;
+ constexpr unsigned int batch = 2;
+
+ auto *src_data = new float[width * height * batch];
+ auto *dst_data = new float[width * height * batch];
+
+ // Fill src_data with dummy values:
+ for(unsigned int b = 0; b < batch; b++)
+ {
+ for(unsigned int h = 0; h < height; h++)
+ {
+ for(unsigned int w = 0; w < width; w++)
+ {
+ src_data[b * (width * height) + h * width + w] = static_cast<float>(100 * b + 10 * h + w);
+ }
+ }
+ }
+
+ Tensor input, output;
+ NESoftmaxLayer softmax;
+
+ // Initialize the tensors dimensions and type:
+ const TensorShape shape(width, height, batch);
+ input.allocator()->init(TensorInfo(shape, 1, DataType::F32));
+ output.allocator()->init(TensorInfo(shape, 1, DataType::F32));
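+ // Note: in TensorInfo(shape, 1, DataType::F32) the second argument is the number
+ // of channels per element (1 here) and the third argument is the data type.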
+
+ // Configure softmax:
+ softmax.configure(&input, &output);
+
+ // Allocate the input / output tensors:
+ input.allocator()->allocate();
+ output.allocator()->allocate();
+
+ // Fill the input tensor:
+ // Simplest way: create an iterator to iterate through each element of the input tensor:
+ Window input_window;
+ input_window.use_tensor_dimensions(input.info());
+ std::cout << " Dimensions of the input's iterator:\n";
+ std::cout << " X = [start=" << input_window.x().start() << ", end=" << input_window.x().end() << ", step=" << input_window.x().step() << "]\n";
+ std::cout << " Y = [start=" << input_window.y().start() << ", end=" << input_window.y().end() << ", step=" << input_window.y().step() << "]\n";
+ std::cout << " Z = [start=" << input_window.z().start() << ", end=" << input_window.z().end() << ", step=" << input_window.z().step() << "]\n";
+
+ // Create an iterator:
+ Iterator input_it(&input, input_window);
+
+ // Iterate through the elements of src_data and copy them one by one to the input tensor:
+ // This is equivalent to:
+ // for( unsigned int z = 0; z < batch; ++z)
+ // {
+ // for( unsigned int y = 0; y < height; ++y)
+ // {
+ // for( unsigned int x = 0; x < width; ++x)
+ // {
+ // *reinterpret_cast<float*>( input.buffer() + input.info()->offset_element_in_bytes(Coordinates(x,y,z))) = src_data[ z * (width*height) + y * width + x];
+ // }
+ // }
+ // }
+ // Except it works for an arbitrary number of dimensions
+ execute_window_loop(input_window, [&](const Coordinates & id)
+ {
+ std::cout << "Setting item [" << id.x() << "," << id.y() << "," << id.z() << "]\n";
+ *reinterpret_cast<float *>(input_it.ptr()) = src_data[id.z() * (width * height) + id.y() * width + id.x()];
+ },
+ input_it);
+
+ // Run NEON softmax:
+ softmax.run();
+
+ // More efficient way: create an iterator to iterate through each row (instead of each element) of the output tensor:
+ Window output_window;
+ output_window.use_tensor_dimensions(output.info(), /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element)
+ std::cout << " Dimensions of the output's iterator:\n";
+ std::cout << " X = [start=" << output_window.x().start() << ", end=" << output_window.x().end() << ", step=" << output_window.x().step() << "]\n";
+ std::cout << " Y = [start=" << output_window.y().start() << ", end=" << output_window.y().end() << ", step=" << output_window.y().step() << "]\n";
+ std::cout << " Z = [start=" << output_window.z().start() << ", end=" << output_window.z().end() << ", step=" << output_window.z().step() << "]\n";
+
+ // Create an iterator:
+ Iterator output_it(&output, output_window);
+
+ // Iterate through the rows of the output tensor and copy them to dst_data:
+ // This is equivalent to:
+ // for( unsigned int z = 0; z < batch; ++z)
+ // {
+ // for( unsigned int y = 0; y < height; ++y)
+ // {
+ // memcpy( dst_data + z * (width*height) + y * width, output.buffer() + output.info()->offset_element_in_bytes(Coordinates(0,y,z)), width * sizeof(float));
+ // }
+ // }
+ // Except it works for an arbitrary number of dimensions
+ execute_window_loop(output_window, [&](const Coordinates & id)
+ {
+ std::cout << "Copying one row starting from [" << id.x() << "," << id.y() << "," << id.z() << "]\n";
+ // Copy one whole row:
+ memcpy(dst_data + id.z() * (width * height) + id.y() * width, output_it.ptr(), width * sizeof(float));
+ },
+ output_it);
+
+ delete[] src_data;
+ delete[] dst_data;
+ /** [Copy objects example] */
+}
+
+/** Main program for the copy objects test
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments
+ */
+int main(int argc, const char **argv)
+{
+ return utils::run_example(argc, argv, main_neon_copy_objects);
+}
diff --git a/examples/neon_scale.cpp b/examples/neon_scale.cpp
new file mode 100644
index 0000000000..75780c9bdb
--- /dev/null
+++ b/examples/neon_scale.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+
+#include "arm_compute/core/Types.h"
+#include "utils/Utils.h"
+
+using namespace arm_compute;
+using namespace utils;
+
+void main_neon_scale(int argc, const char **argv)
+{
+ PPMLoader ppm;
+ Image src, dst;
+
+ if(argc < 2)
+ {
+ // Print help
+ std::cout << "Usage: ./build/neon_scale[input_image.ppm]\n\n";
+ std::cout << "No input_image provided, creating a dummy 640x480 image\n";
+ // Create an empty grayscale 640x480 image
+ src.allocator()->init(TensorInfo(640, 480, Format::U8));
+ }
+ else
+ {
+ ppm.open(argv[1]);
+ ppm.init_image(src, Format::U8);
+ }
+
+ constexpr int scale_factor = 2;
+
+ TensorInfo dst_tensor_info(src.info()->dimension(0) / scale_factor, src.info()->dimension(1) / scale_factor, Format::U8);
+
+ // Configure the destination image
+ dst.allocator()->init(dst_tensor_info);
+
+ // Create and initialize a Scale function object:
+ NEScale scale;
+ scale.configure(&src, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
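+ // A different interpolation policy could be requested instead; for example, the
+ // bilinear variant of the call above would be:
+ // scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::UNDEFINED);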
+
+ // Allocate all the images
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+ // Fill the input image with the content of the PPM image if a filename was provided:
+ if(ppm.is_open())
+ {
+ ppm.fill_image(src);
+ }
+
+ // Run the scale operation:
+ scale.run();
+
+ // Save the result to file:
+ if(ppm.is_open())
+ {
+ const std::string output_filename = std::string(argv[1]) + "_out.ppm";
+ save_to_ppm(dst, output_filename);
+ }
+}
+
+/** Main program for the scale test
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Path to PPM image to process )
+ */
+int main(int argc, const char **argv)
+{
+ return utils::run_example(argc, argv, main_neon_scale);
+}
diff --git a/examples/neoncl_scale_median_gaussian.cpp b/examples/neoncl_scale_median_gaussian.cpp
new file mode 100644
index 0000000000..a32ba6daf6
--- /dev/null
+++ b/examples/neoncl_scale_median_gaussian.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define ARM_COMPUTE_CL /* So that OpenCL exceptions get caught too */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "utils/Utils.h"
+
+using namespace arm_compute;
+using namespace utils;
+
+/** Example demonstrating how to use both CL and NEON functions in the same pipeline
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Path to PPM image to process )
+ */
+void main_neoncl_scale_median_gaussian(int argc, const char **argv)
+{
+ /** [NEON / OpenCL Interop] */
+ PPMLoader ppm;
+ CLImage src, scale_median, median_gauss, dst;
+
+ CLScheduler::get().default_init();
+
+ if(argc < 2)
+ {
+ // Print help
+ std::cout << "Usage: ./build/cl_convolution [input_image.ppm]\n\n";
+ std::cout << "No input_image provided, creating a dummy 640x480 image\n";
+ // Create an empty grayscale 640x480 image
+ src.allocator()->init(TensorInfo(640, 480, Format::U8));
+ }
+ else
+ {
+ ppm.open(argv[1]);
+ ppm.init_image(src, Format::U8);
+ }
+
+ TensorInfo scale_median_info(TensorInfo(src.info()->dimension(0) / 2, src.info()->dimension(1) / 2, Format::U8));
+
+ // Configure the temporary and destination images
+ scale_median.allocator()->init(scale_median_info);
+ median_gauss.allocator()->init(scale_median_info);
+ dst.allocator()->init(scale_median_info);
+
+ // Declare and configure the functions to create the following pipeline: scale -> median -> gauss
+ CLScale scale;
+ NEMedian3x3 median;
+ CLGaussian5x5 gauss;
+
+ scale.configure(&src, &scale_median, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE);
+ median.configure(&scale_median, &median_gauss, BorderMode::REPLICATE);
+ gauss.configure(&median_gauss, &dst, BorderMode::REPLICATE);
+
+ // Allocate all the images
+ src.allocator()->allocate();
+ scale_median.allocator()->allocate();
+ median_gauss.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill the input image with the content of the PPM image if a filename was provided:
+ if(ppm.is_open())
+ {
+ ppm.fill_image(src);
+ }
+
+ // Enqueue and flush the OpenCL kernel:
+ scale.run();
+
+ // Do a blocking map of the input and output buffers of the NEON function:
+ scale_median.map();
+ median_gauss.map();
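+ // (map() blocks by default, so both buffers are safe to access from the CPU once it returns)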
+
+ // Run the NEON function:
+ median.run();
+
+ // Unmap the output buffer before it's used again by OpenCL:
+ scale_median.unmap();
+ median_gauss.unmap();
+
+ // Run the final OpenCL function:
+ gauss.run();
+
+ // Make sure all the OpenCL jobs are done executing:
+ CLScheduler::get().sync();
+
+ // Save the result to file:
+ if(ppm.is_open())
+ {
+ const std::string output_filename = std::string(argv[1]) + "_out.ppm";
+ save_to_ppm(dst, output_filename); // save_to_ppm maps and unmaps the image to store as PPM
+ }
+ /** [NEON / OpenCL Interop] */
+}
+
+/** Main program for the NEON / OpenCL scale, median and Gaussian test
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Path to PPM image to process )
+ */
+int main(int argc, const char **argv)
+{
+ return utils::run_example(argc, argv, main_neoncl_scale_median_gaussian);
+}
diff --git a/include/CL/cl.h b/include/CL/cl.h
new file mode 100644
index 0000000000..b41b1f9ff8
--- /dev/null
+++ b/include/CL/cl.h
@@ -0,0 +1,1214 @@
+/*******************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_platform.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id * cl_platform_id;
+typedef struct _cl_device_id * cl_device_id;
+typedef struct _cl_context * cl_context;
+typedef struct _cl_command_queue * cl_command_queue;
+typedef struct _cl_mem * cl_mem;
+typedef struct _cl_program * cl_program;
+typedef struct _cl_kernel * cl_kernel;
+typedef struct _cl_event * cl_event;
+typedef struct _cl_sampler * cl_sampler;
+
+typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_ulong cl_bitfield;
+typedef cl_bitfield cl_device_type;
+typedef cl_uint cl_platform_info;
+typedef cl_uint cl_device_info;
+typedef cl_bitfield cl_device_fp_config;
+typedef cl_uint cl_device_mem_cache_type;
+typedef cl_uint cl_device_local_mem_type;
+typedef cl_bitfield cl_device_exec_capabilities;
+typedef cl_bitfield cl_command_queue_properties;
+typedef intptr_t cl_device_partition_property;
+typedef cl_bitfield cl_device_affinity_domain;
+
+typedef intptr_t cl_context_properties;
+typedef cl_uint cl_context_info;
+typedef cl_uint cl_command_queue_info;
+typedef cl_uint cl_channel_order;
+typedef cl_uint cl_channel_type;
+typedef cl_bitfield cl_mem_flags;
+typedef cl_uint cl_mem_object_type;
+typedef cl_uint cl_mem_info;
+typedef cl_bitfield cl_mem_migration_flags;
+typedef cl_uint cl_image_info;
+typedef cl_uint cl_buffer_create_type;
+typedef cl_uint cl_addressing_mode;
+typedef cl_uint cl_filter_mode;
+typedef cl_uint cl_sampler_info;
+typedef cl_bitfield cl_map_flags;
+typedef cl_uint cl_program_info;
+typedef cl_uint cl_program_build_info;
+typedef cl_uint cl_program_binary_type;
+typedef cl_int cl_build_status;
+typedef cl_uint cl_kernel_info;
+typedef cl_uint cl_kernel_arg_info;
+typedef cl_uint cl_kernel_arg_address_qualifier;
+typedef cl_uint cl_kernel_arg_access_qualifier;
+typedef cl_bitfield cl_kernel_arg_type_qualifier;
+typedef cl_uint cl_kernel_work_group_info;
+typedef cl_uint cl_event_info;
+typedef cl_uint cl_command_type;
+typedef cl_uint cl_profiling_info;
+
+
+typedef struct _cl_image_format {
+ cl_channel_order image_channel_order;
+ cl_channel_type image_channel_data_type;
+} cl_image_format;
+
+typedef struct _cl_image_desc {
+ cl_mem_object_type image_type;
+ size_t image_width;
+ size_t image_height;
+ size_t image_depth;
+ size_t image_array_size;
+ size_t image_row_pitch;
+ size_t image_slice_pitch;
+ cl_uint num_mip_levels;
+ cl_uint num_samples;
+ cl_mem buffer;
+} cl_image_desc;
+
+typedef struct _cl_buffer_region {
+ size_t origin;
+ size_t size;
+} cl_buffer_region;
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS 0
+#define CL_DEVICE_NOT_FOUND -1
+#define CL_DEVICE_NOT_AVAILABLE -2
+#define CL_COMPILER_NOT_AVAILABLE -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
+#define CL_OUT_OF_RESOURCES -5
+#define CL_OUT_OF_HOST_MEMORY -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE -7
+#define CL_MEM_COPY_OVERLAP -8
+#define CL_IMAGE_FORMAT_MISMATCH -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
+#define CL_BUILD_PROGRAM_FAILURE -11
+#define CL_MAP_FAILURE -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE -15
+#define CL_LINKER_NOT_AVAILABLE -16
+#define CL_LINK_PROGRAM_FAILURE -17
+#define CL_DEVICE_PARTITION_FAILED -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
+
+#define CL_INVALID_VALUE -30
+#define CL_INVALID_DEVICE_TYPE -31
+#define CL_INVALID_PLATFORM -32
+#define CL_INVALID_DEVICE -33
+#define CL_INVALID_CONTEXT -34
+#define CL_INVALID_QUEUE_PROPERTIES -35
+#define CL_INVALID_COMMAND_QUEUE -36
+#define CL_INVALID_HOST_PTR -37
+#define CL_INVALID_MEM_OBJECT -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
+#define CL_INVALID_IMAGE_SIZE -40
+#define CL_INVALID_SAMPLER -41
+#define CL_INVALID_BINARY -42
+#define CL_INVALID_BUILD_OPTIONS -43
+#define CL_INVALID_PROGRAM -44
+#define CL_INVALID_PROGRAM_EXECUTABLE -45
+#define CL_INVALID_KERNEL_NAME -46
+#define CL_INVALID_KERNEL_DEFINITION -47
+#define CL_INVALID_KERNEL -48
+#define CL_INVALID_ARG_INDEX -49
+#define CL_INVALID_ARG_VALUE -50
+#define CL_INVALID_ARG_SIZE -51
+#define CL_INVALID_KERNEL_ARGS -52
+#define CL_INVALID_WORK_DIMENSION -53
+#define CL_INVALID_WORK_GROUP_SIZE -54
+#define CL_INVALID_WORK_ITEM_SIZE -55
+#define CL_INVALID_GLOBAL_OFFSET -56
+#define CL_INVALID_EVENT_WAIT_LIST -57
+#define CL_INVALID_EVENT -58
+#define CL_INVALID_OPERATION -59
+#define CL_INVALID_GL_OBJECT -60
+#define CL_INVALID_BUFFER_SIZE -61
+#define CL_INVALID_MIP_LEVEL -62
+#define CL_INVALID_GLOBAL_WORK_SIZE -63
+#define CL_INVALID_PROPERTY -64
+#define CL_INVALID_IMAGE_DESCRIPTOR -65
+#define CL_INVALID_COMPILER_OPTIONS -66
+#define CL_INVALID_LINKER_OPTIONS -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT -68
+
+/* OpenCL Version */
+#define CL_VERSION_1_0 1
+#define CL_VERSION_1_1 1
+#define CL_VERSION_1_2 1
+
+/* cl_bool */
+#define CL_FALSE 0
+#define CL_TRUE 1
+#define CL_BLOCKING CL_TRUE
+#define CL_NON_BLOCKING CL_FALSE
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE 0x0900
+#define CL_PLATFORM_VERSION 0x0901
+#define CL_PLATFORM_NAME 0x0902
+#define CL_PLATFORM_VENDOR 0x0903
+#define CL_PLATFORM_EXTENSIONS 0x0904
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
+#define CL_DEVICE_TYPE_CPU (1 << 1)
+#define CL_DEVICE_TYPE_GPU (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
+#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE 0x1000
+#define CL_DEVICE_VENDOR_ID 0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
+#define CL_DEVICE_ADDRESS_BITS 0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT 0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
+#define CL_DEVICE_MAX_SAMPLERS 0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
+#define CL_DEVICE_ENDIAN_LITTLE 0x1026
+#define CL_DEVICE_AVAILABLE 0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES 0x102A
+#define CL_DEVICE_NAME 0x102B
+#define CL_DEVICE_VENDOR 0x102C
+#define CL_DRIVER_VERSION 0x102D
+#define CL_DEVICE_PROFILE 0x102E
+#define CL_DEVICE_VERSION 0x102F
+#define CL_DEVICE_EXTENSIONS 0x1030
+#define CL_DEVICE_PLATFORM 0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
+#define CL_DEVICE_OPENCL_C_VERSION 0x103D
+#define CL_DEVICE_LINKER_AVAILABLE 0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
+#define CL_DEVICE_PARENT_DEVICE 0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
+#define CL_DEVICE_PARTITION_TYPE 0x1046
+#define CL_DEVICE_REFERENCE_COUNT 0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM (1 << 0)
+#define CL_FP_INF_NAN (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST (1 << 2)
+#define CL_FP_ROUND_TO_ZERO (1 << 3)
+#define CL_FP_ROUND_TO_INF (1 << 4)
+#define CL_FP_FMA (1 << 5)
+#define CL_FP_SOFT_FLOAT (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE 0x0
+#define CL_READ_ONLY_CACHE 0x1
+#define CL_READ_WRITE_CACHE 0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL 0x1
+#define CL_GLOBAL 0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
+
+/* cl_context_info */
+#define CL_CONTEXT_REFERENCE_COUNT 0x1080
+#define CL_CONTEXT_DEVICES 0x1081
+#define CL_CONTEXT_PROPERTIES 0x1082
+#define CL_CONTEXT_NUM_DEVICES 0x1083
+
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM 0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
+
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
+
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT 0x1090
+#define CL_QUEUE_DEVICE 0x1091
+#define CL_QUEUE_REFERENCE_COUNT 0x1092
+#define CL_QUEUE_PROPERTIES 0x1093
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE (1 << 0)
+#define CL_MEM_WRITE_ONLY (1 << 1)
+#define CL_MEM_READ_ONLY (1 << 2)
+#define CL_MEM_USE_HOST_PTR (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
+#define CL_MEM_COPY_HOST_PTR (1 << 5)
+/* reserved (1 << 6)*/
+#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
+#define CL_MEM_HOST_READ_ONLY (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS (1 << 9)
+
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
+
+/* cl_channel_order */
+#define CL_R 0x10B0
+#define CL_A 0x10B1
+#define CL_RG 0x10B2
+#define CL_RA 0x10B3
+#define CL_RGB 0x10B4
+#define CL_RGBA 0x10B5
+#define CL_BGRA 0x10B6
+#define CL_ARGB 0x10B7
+#define CL_INTENSITY 0x10B8
+#define CL_LUMINANCE 0x10B9
+#define CL_Rx 0x10BA
+#define CL_RGx 0x10BB
+#define CL_RGBx 0x10BC
+#define CL_DEPTH 0x10BD
+#define CL_DEPTH_STENCIL 0x10BE
+
+/* cl_channel_type */
+#define CL_SNORM_INT8 0x10D0
+#define CL_SNORM_INT16 0x10D1
+#define CL_UNORM_INT8 0x10D2
+#define CL_UNORM_INT16 0x10D3
+#define CL_UNORM_SHORT_565 0x10D4
+#define CL_UNORM_SHORT_555 0x10D5
+#define CL_UNORM_INT_101010 0x10D6
+#define CL_SIGNED_INT8 0x10D7
+#define CL_SIGNED_INT16 0x10D8
+#define CL_SIGNED_INT32 0x10D9
+#define CL_UNSIGNED_INT8 0x10DA
+#define CL_UNSIGNED_INT16 0x10DB
+#define CL_UNSIGNED_INT32 0x10DC
+#define CL_HALF_FLOAT 0x10DD
+#define CL_FLOAT 0x10DE
+#define CL_UNORM_INT24 0x10DF
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER 0x10F0
+#define CL_MEM_OBJECT_IMAGE2D 0x10F1
+#define CL_MEM_OBJECT_IMAGE3D 0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D 0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
+
+/* cl_mem_info */
+#define CL_MEM_TYPE 0x1100
+#define CL_MEM_FLAGS 0x1101
+#define CL_MEM_SIZE 0x1102
+#define CL_MEM_HOST_PTR 0x1103
+#define CL_MEM_MAP_COUNT 0x1104
+#define CL_MEM_REFERENCE_COUNT 0x1105
+#define CL_MEM_CONTEXT 0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
+#define CL_MEM_OFFSET 0x1108
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT 0x1110
+#define CL_IMAGE_ELEMENT_SIZE 0x1111
+#define CL_IMAGE_ROW_PITCH 0x1112
+#define CL_IMAGE_SLICE_PITCH 0x1113
+#define CL_IMAGE_WIDTH 0x1114
+#define CL_IMAGE_HEIGHT 0x1115
+#define CL_IMAGE_DEPTH 0x1116
+#define CL_IMAGE_ARRAY_SIZE 0x1117
+#define CL_IMAGE_BUFFER 0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
+#define CL_IMAGE_NUM_SAMPLES 0x111A
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE 0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
+#define CL_ADDRESS_CLAMP 0x1132
+#define CL_ADDRESS_REPEAT 0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST 0x1140
+#define CL_FILTER_LINEAR 0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT 0x1150
+#define CL_SAMPLER_CONTEXT 0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
+#define CL_SAMPLER_ADDRESSING_MODE 0x1153
+#define CL_SAMPLER_FILTER_MODE 0x1154
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ (1 << 0)
+#define CL_MAP_WRITE (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT 0x1160
+#define CL_PROGRAM_CONTEXT 0x1161
+#define CL_PROGRAM_NUM_DEVICES 0x1162
+#define CL_PROGRAM_DEVICES 0x1163
+#define CL_PROGRAM_SOURCE 0x1164
+#define CL_PROGRAM_BINARY_SIZES 0x1165
+#define CL_PROGRAM_BINARIES 0x1166
+#define CL_PROGRAM_NUM_KERNELS 0x1167
+#define CL_PROGRAM_KERNEL_NAMES 0x1168
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS 0x1181
+#define CL_PROGRAM_BUILD_OPTIONS 0x1182
+#define CL_PROGRAM_BUILD_LOG 0x1183
+#define CL_PROGRAM_BINARY_TYPE 0x1184
+
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS 0
+#define CL_BUILD_NONE -1
+#define CL_BUILD_ERROR -2
+#define CL_BUILD_IN_PROGRESS -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME 0x1190
+#define CL_KERNEL_NUM_ARGS 0x1191
+#define CL_KERNEL_REFERENCE_COUNT 0x1192
+#define CL_KERNEL_CONTEXT 0x1193
+#define CL_KERNEL_PROGRAM 0x1194
+#define CL_KERNEL_ATTRIBUTES 0x1195
+
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
+#define CL_KERNEL_ARG_TYPE_NAME 0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
+#define CL_KERNEL_ARG_NAME 0x119A
+
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
+
+/* cl_kernel_arg_access_qualifier */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
+
+/* cl_kernel_arg_type_qualifier */
+#define CL_KERNEL_ARG_TYPE_NONE 0
+#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
+
+/* cl_event_info */
+#define CL_EVENT_COMMAND_QUEUE 0x11D0
+#define CL_EVENT_COMMAND_TYPE 0x11D1
+#define CL_EVENT_REFERENCE_COUNT 0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
+#define CL_EVENT_CONTEXT 0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
+#define CL_COMMAND_TASK 0x11F1
+#define CL_COMMAND_NATIVE_KERNEL 0x11F2
+#define CL_COMMAND_READ_BUFFER 0x11F3
+#define CL_COMMAND_WRITE_BUFFER 0x11F4
+#define CL_COMMAND_COPY_BUFFER 0x11F5
+#define CL_COMMAND_READ_IMAGE 0x11F6
+#define CL_COMMAND_WRITE_IMAGE 0x11F7
+#define CL_COMMAND_COPY_IMAGE 0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
+#define CL_COMMAND_MAP_BUFFER 0x11FB
+#define CL_COMMAND_MAP_IMAGE 0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
+#define CL_COMMAND_MARKER 0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
+#define CL_COMMAND_READ_BUFFER_RECT 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
+#define CL_COMMAND_USER 0x1204
+#define CL_COMMAND_BARRIER 0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
+#define CL_COMMAND_FILL_BUFFER 0x1207
+#define CL_COMMAND_FILL_IMAGE 0x1208
+
+/* command execution status */
+#define CL_COMPLETE 0x0
+#define CL_RUNNING 0x1
+#define CL_SUBMITTED 0x2
+#define CL_QUEUED 0x3
+
+/* cl_buffer_create_type */
+#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
+
+/* cl_profiling_info */
+#define CL_PROFILING_COMMAND_QUEUED 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT 0x1281
+#define CL_PROFILING_COMMAND_START 0x1282
+#define CL_PROFILING_COMMAND_END 0x1283
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id /* platform */,
+ cl_platform_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id /* platform */,
+ cl_device_type /* device_type */,
+ cl_uint /* num_entries */,
+ cl_device_id * /* devices */,
+ cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id /* device */,
+ cl_device_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id /* in_device */,
+ const cl_device_partition_property * /* properties */,
+ cl_uint /* num_devices */,
+ cl_device_id * /* out_devices */,
+ cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+
+/* Context APIs */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* devices */,
+ void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+ cl_device_type /* device_type */,
+ void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context /* context */,
+ cl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context /* context */,
+ cl_device_id /* device */,
+ cl_command_queue_properties /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue /* command_queue */,
+ cl_command_queue_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ size_t /* size */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem /* buffer */,
+ cl_mem_flags /* flags */,
+ cl_buffer_create_type /* buffer_create_type */,
+ const void * /* buffer_create_info */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ const cl_image_desc * /* image_desc */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_mem_object_type /* image_type */,
+ cl_uint /* num_entries */,
+ cl_image_format * /* image_formats */,
+ cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem /* memobj */,
+ cl_mem_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem /* image */,
+ cl_image_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback( cl_mem /* memobj */,
+ void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1;
+
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context /* context */,
+ cl_bool /* normalized_coords */,
+ cl_addressing_mode /* addressing_mode */,
+ cl_filter_mode /* filter_mode */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler /* sampler */,
+ cl_sampler_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Program Object APIs */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context /* context */,
+ cl_uint /* count */,
+ const char ** /* strings */,
+ const size_t * /* lengths */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const size_t * /* lengths */,
+ const unsigned char ** /* binaries */,
+ cl_int * /* binary_status */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* kernel_names */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program /* program */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_headers */,
+ const cl_program * /* input_headers */,
+ const char ** /* header_include_names */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context /* context */,
+ cl_uint /* num_devices */,
+ const cl_device_id * /* device_list */,
+ const char * /* options */,
+ cl_uint /* num_input_programs */,
+ const cl_program * /* input_programs */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */,
+ cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program /* program */,
+ cl_program_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program /* program */,
+ cl_device_id /* device */,
+ cl_program_build_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program /* program */,
+ const char * /* kernel_name */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program /* program */,
+ cl_uint /* num_kernels */,
+ cl_kernel * /* kernels */,
+ cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel /* kernel */,
+ cl_uint /* arg_index */,
+ size_t /* arg_size */,
+ const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel /* kernel */,
+ cl_kernel_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel /* kernel */,
+ cl_uint /* arg_indx */,
+ cl_kernel_arg_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel /* kernel */,
+ cl_device_id /* device */,
+ cl_kernel_work_group_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event /* event */,
+ cl_event_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context /* context */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event /* event */,
+ cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event /* event */,
+ cl_int /* command_exec_callback_type */,
+ void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event /* event */,
+ cl_profiling_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ size_t /* offset */,
+ size_t /* size */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_read */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ size_t /* offset */,
+ size_t /* size */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_write */,
+ const size_t * /* buffer_offset */,
+ const size_t * /* host_offset */,
+ const size_t * /* region */,
+ size_t /* buffer_row_pitch */,
+ size_t /* buffer_slice_pitch */,
+ size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ const void * /* pattern */,
+ size_t /* pattern_size */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ size_t /* src_offset */,
+ size_t /* dst_offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin */,
+ const size_t * /* dst_origin */,
+ const size_t * /* region */,
+ size_t /* src_row_pitch */,
+ size_t /* src_slice_pitch */,
+ size_t /* dst_row_pitch */,
+ size_t /* dst_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_read */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* row_pitch */,
+ size_t /* slice_pitch */,
+ void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_write */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* input_row_pitch */,
+ size_t /* input_slice_pitch */,
+ const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ const void * /* fill_color */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_image */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* src_image */,
+ cl_mem /* dst_buffer */,
+ const size_t * /* src_origin[3] */,
+ const size_t * /* region[3] */,
+ size_t /* dst_offset */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+ cl_mem /* src_buffer */,
+ cl_mem /* dst_image */,
+ size_t /* src_offset */,
+ const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+ cl_mem /* buffer */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ size_t /* offset */,
+ size_t /* size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue /* command_queue */,
+ cl_mem /* image */,
+ cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */,
+ const size_t * /* origin[3] */,
+ const size_t * /* region[3] */,
+ size_t * /* image_row_pitch */,
+ size_t * /* image_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+ cl_mem /* memobj */,
+ void * /* mapped_ptr */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_mem_migration_flags /* flags */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* work_dim */,
+ const size_t * /* global_work_offset */,
+ const size_t * /* global_work_size */,
+ const size_t * /* local_work_size */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue /* command_queue */,
+ void (CL_CALLBACK * /*user_func*/)(void *),
+ void * /* args */,
+ size_t /* cb_args */,
+ cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_list */,
+ const void ** /* args_mem_loc */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function cannot be found. The client must
+ * check that the returned address is not NULL before using or
+ * calling it.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+ const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2;
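+
+/* Illustrative usage sketch; the PFN typedef and extension name below are
+ * hypothetical placeholders, not real entry points:
+ *
+ *     typedef cl_int (CL_API_CALL *PFN_clExampleExtFn)(cl_device_id);
+ *
+ *     PFN_clExampleExtFn fn = (PFN_clExampleExtFn)
+ *         clGetExtensionFunctionAddressForPlatform(platform, "clExampleExtFn");
+ *     if (fn != NULL) {
+ *         (void)fn(device);   // only call through a non-NULL pointer
+ *     }
+ */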
+
+
+/* Deprecated OpenCL 1.1 APIs*/
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_row_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_depth */,
+ size_t /* image_row_pitch */,
+ size_t /* image_slice_pitch */,
+ void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue /* command_queue */,
+ cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+ cl_uint /* num_events */,
+ const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_H */
+
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
new file mode 100644
index 0000000000..38fac1962a
--- /dev/null
+++ b/include/CL/cl.hpp
@@ -0,0 +1,12452 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and
+ * OpenCL 1.2 (rev 15)
+ * \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
+ *
+ * Additions and fixes from:
+ * Brian Cole, March 3rd 2010 and April 2012
+ * Matt Gruenke, April 2012.
+ * Bruce Merry, February 2013.
+ * Tom Deakin and Simon McIntosh-Smith, July 2013
+ *
+ * \version 1.2.6
+ * \date August 2013
+ *
+ * Optional extension support
+ *
+ * cl
+ * cl_ext_device_fission
+ * #define USE_CL_DEVICE_FISSION
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ *
+ * The interface is contained within a single C++ header file \em cl.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h; to use either the C++ or original C
+ * bindings it is enough to simply include \em cl.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * For detailed documentation on the bindings, see:
+ *
+ * The OpenCL C++ Wrapper API 1.2 (revision 09)
+ * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes; see the following sections
+ * for descriptions of these features.
+ *
+ * \code
+ * #define __CL_ENABLE_EXCEPTIONS
+ *
+ * #if defined(__APPLE__) || defined(__MACOSX)
+ * #include <OpenCL/cl.hpp>
+ * #else
+ * #include <CL/cl.hpp>
+ * #endif
+ * #include <cstdio>
+ * #include <cstdlib>
+ * #include <iostream>
+ *
+ * const char * helloStr = "__kernel void "
+ * "hello(void) "
+ * "{ "
+ * " "
+ * "} ";
+ *
+ * int
+ * main(void)
+ * {
+ * cl_int err = CL_SUCCESS;
+ * try {
+ *
+ * std::vector<cl::Platform> platforms;
+ * cl::Platform::get(&platforms);
+ * if (platforms.size() == 0) {
+ * std::cout << "Platform size 0\n";
+ * return -1;
+ * }
+ *
+ * cl_context_properties properties[] =
+ * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+ * cl::Context context(CL_DEVICE_TYPE_CPU, properties);
+ *
+ * std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ *
+ * cl::Program::Sources source(1,
+ * std::make_pair(helloStr,strlen(helloStr)));
+ * cl::Program program_ = cl::Program(context, source);
+ * program_.build(devices);
+ *
+ * cl::Kernel kernel(program_, "hello", &err);
+ *
+ * cl::Event event;
+ * cl::CommandQueue queue(context, devices[0], 0, &err);
+ * queue.enqueueNDRangeKernel(
+ * kernel,
+ * cl::NullRange,
+ * cl::NDRange(4,4),
+ * cl::NullRange,
+ * NULL,
+ * &event);
+ *
+ * event.wait();
+ * }
+ * catch (cl::Error err) {
+ * std::cerr
+ * << "ERROR: "
+ * << err.what()
+ * << "("
+ * << err.err()
+ * << ")"
+ * << std::endl;
+ * }
+ *
+ * return EXIT_SUCCESS;
+ * }
+ *
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <malloc.h>
+#include <iterator>
+#include <intrin.h>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#pragma push_macro("max")
+#undef max
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+//
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#include <libkern/OSAtomic.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS
+#endif // #if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif
+
+#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+#include <alloca.h>
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif // linux
+
+#include <cstring>
+
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+
+class Memory;
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __INIT_CL_EXT_FCN_PTR(name) \
+ if(!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddress(#name); \
+ if(!pfn_##name) { \
+ } \
+ }
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+ if(!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddressForPlatform(platform, #name); \
+ if(!pfn_##name) { \
+ } \
+ }
+#endif // #if defined(CL_VERSION_1_2)
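+
+// Illustrative sketch of how these helpers are typically used; the
+// extension entry point "clExampleExtFn" and its PFN typedef below are
+// hypothetical placeholders:
+//
+//     typedef cl_int (CL_API_CALL *PFN_clExampleExtFn)(cl_device_id);
+//     static PFN_clExampleExtFn pfn_clExampleExtFn = NULL;
+//     ...
+//     __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clExampleExtFn);
+//     // pfn_clExampleExtFn is now either a valid pointer or still NULL;
+//     // callers must check before invoking it.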
+
+class Program;
+class Device;
+class Context;
+class CommandQueue;
+class Buffer;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+/*! \brief Exception class
+ *
+ * This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
+ */
+class Error : public std::exception
+{
+private:
+ cl_int err_;
+ const char * errStr_;
+public:
+ /*! \brief Create a new CL error exception for a given error code
+ * and corresponding message.
+ *
+ * \param err error code value.
+ *
+ * \param errStr a descriptive string that must remain in scope until
+ * handling of the exception has concluded. If set, it
+ * will be returned by what().
+ */
+ Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+ {}
+
+ ~Error() throw() {}
+
+ /*! \brief Get error string associated with exception
+ *
+ * \return A memory pointer to the error message string.
+ */
+ virtual const char * what() const throw ()
+ {
+ if (errStr_ == NULL) {
+ return "empty";
+ }
+ else {
+ return errStr_;
+ }
+ }
+
+ /*! \brief Get error code associated with exception
+ *
+ * \return The error code.
+ */
+ cl_int err(void) const { return err_; }
+};
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+
+namespace detail
+{
+#if defined(__CL_ENABLE_EXCEPTIONS)
+static inline cl_int errHandler (
+ cl_int err,
+ const char * errStr = NULL)
+{
+ if (err != CL_SUCCESS) {
+ throw Error(err, errStr);
+ }
+ return err;
+}
+#else
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+ (void) errStr; // suppress unused variable warning
+ return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+}
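+
+// Illustrative sketch: both errHandler variants share one calling
+// convention, so call sites can be written once for the exception and
+// error-code builds (the queue handle here is hypothetical):
+//
+//     cl_int err = detail::errHandler(::clFinish(queue), "clFinish");
+//     // With __CL_ENABLE_EXCEPTIONS defined, a failure throws cl::Error;
+//     // otherwise the raw error code is returned to the caller unchanged.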
+
+
+
+//! \cond DOXYGEN_DETAIL
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo)
+#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo)
+#if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo)
+#endif // #if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext)
+#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer)
+#define __COPY_ERR __ERR_STR(cl::copy)
+#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer)
+#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions)
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels)
+#endif // #if defined(CL_VERSION_1_2)
+#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram)
+#if defined(CL_VERSION_1_2)
+#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram)
+
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer)
+#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnmapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask)
+#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel)
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects)
+#endif // #if defined(CL_VERSION_1_2)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects)
+
+
+#define __RETAIN_ERR __ERR_STR(Retain Object)
+#define __RELEASE_ERR __ERR_STR(Release Object)
+#define __FLUSH_ERR __ERR_STR(clFlush)
+#define __FINISH_ERR __ERR_STR(clFinish)
+#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error)
+
+/**
+ * Sub-device creation: use the core CL 1.2 entry point where available,
+ * otherwise fall back to the cl_ext_device_fission extension.
+ */
+#if defined(CL_VERSION_1_2)
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT)
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D)
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING)
+
+/*! \class string
+ * \brief Simple string class that provides a limited subset of std::string
+ * functionality but avoids many of the issues that come with that class.
+ *
+ * \note Deprecated. Please use std::string as the default, or
+ * re-define the string class to match the std::string
+ * interface by defining STRING_CLASS.
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+ ::size_t size_;
+ char * str_;
+public:
+ //! \brief Constructs an empty string, allocating no memory.
+ string(void) : size_(0), str_(NULL)
+ {
+ }
+
+ /*! \brief Constructs a string populated from an arbitrary value of
+ * specified size.
+ *
+ * An extra '\0' is added, in case none was contained in str.
+ *
+ * \param str the initial value of the string instance. Note that '\0'
+ * characters receive no special treatment. If NULL,
+ * the string is left empty, with a size of 0.
+ *
+ * \param size the number of characters to copy from str.
+ */
+ string(const char * str, ::size_t size) :
+ size_(size),
+ str_(NULL)
+ {
+ if( size > 0 ) {
+ str_ = new char[size_+1];
+ if (str_ != NULL) {
+ memcpy(str_, str, size_ * sizeof(char));
+ str_[size_] = '\0';
+ }
+ else {
+ size_ = 0;
+ }
+ }
+ }
+
+ /*! \brief Constructs a string populated from a null-terminated value.
+ *
+ * \param str the null-terminated initial value of the string instance.
+ * If NULL, the string is left empty, with a size of 0.
+ */
+ string(const char * str) :
+ size_(0),
+ str_(NULL)
+ {
+ if( str ) {
+ size_= ::strlen(str);
+ }
+ if( size_ > 0 ) {
+ str_ = new char[size_ + 1];
+ if (str_ != NULL) {
+ memcpy(str_, str, (size_ + 1) * sizeof(char));
+ }
+ }
+ }
+
+ void resize( ::size_t n )
+ {
+ if( size_ == n ) {
+ return;
+ }
+ if (n == 0) {
+ if( str_ ) {
+ delete [] str_;
+ }
+ str_ = NULL;
+ size_ = 0;
+ }
+ else {
+ char *newString = new char[n + 1];
+ ::size_t copySize = n;
+ if( size_ < n ) {
+ copySize = size_;
+ }
+ size_ = n;
+
+ if(str_) {
+ memcpy(newString, str_, (copySize + 1) * sizeof(char));
+ }
+ if( copySize < size_ ) {
+ memset(newString + copySize, 0, size_ - copySize);
+ }
+ newString[size_] = '\0';
+
+ delete [] str_;
+ str_ = newString;
+ }
+ }
+
+ const char& operator[] ( ::size_t pos ) const
+ {
+ return str_[pos];
+ }
+
+ char& operator[] ( ::size_t pos )
+ {
+ return str_[pos];
+ }
+
+ /*! \brief Copies the value of another string to this one.
+ *
+ * \param rhs the string to copy.
+ *
+ * \returns a reference to the modified instance.
+ */
+ string& operator=(const string& rhs)
+ {
+ if (this == &rhs) {
+ return *this;
+ }
+
+ if( str_ != NULL ) {
+ delete [] str_;
+ str_ = NULL;
+ size_ = 0;
+ }
+
+ if (rhs.size_ == 0 || rhs.str_ == NULL) {
+ str_ = NULL;
+ size_ = 0;
+ }
+ else {
+ str_ = new char[rhs.size_ + 1];
+ size_ = rhs.size_;
+
+ if (str_ != NULL) {
+ memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
+ }
+ else {
+ size_ = 0;
+ }
+ }
+
+ return *this;
+ }
+
+ /*! \brief Constructs a string by copying the value of another instance.
+ *
+ * \param rhs the string to copy.
+ */
+ string(const string& rhs) :
+ size_(0),
+ str_(NULL)
+ {
+ *this = rhs;
+ }
+
+ //! \brief Destructor - frees memory used to hold the current value.
+ ~string()
+ {
+ delete[] str_;
+ str_ = NULL;
+ }
+
+ //! \brief Queries the length of the string, excluding any added '\0's.
+ ::size_t size(void) const { return size_; }
+
+ //! \brief Queries the length of the string, excluding any added '\0's.
+ ::size_t length(void) const { return size(); }
+
+ /*! \brief Returns a pointer to the private copy held by this instance,
+ * or "" if empty/unset.
+ */
+ const char * c_str(void) const { return (str_) ? str_ : "";}
+};
+typedef cl::string STRING_CLASS;
+#endif // #elif !defined(__USE_DEV_STRING)
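+
+/* Illustrative sketch of the deprecated cl::string replacement when
+ * std::string is unavailable:
+ *
+ * \code
+ * cl::string s("OpenCL", 6);  // copies 6 chars and appends a '\0'
+ * ::size_t n = s.size();      // 6; the added terminator is not counted
+ * const char * p = s.c_str(); // never NULL; "" when the string is empty
+ * \endcode
+ */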
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR)
+#define VECTOR_CLASS cl::vector
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+/*! \class vector
+ * \brief Fixed-size vector implementation mirroring a subset of
+ * std::vector functionality; compatible with std::vector.
+ *
+ * \note Deprecated. Please use std::vector as the default, or
+ * re-define the vector class to match the std::vector
+ * interface by defining VECTOR_CLASS.
+ *
+ * \note Not recommended for use with custom objects, as the
+ * current implementation constructs all N elements up front.
+ *
+ * \note
+ * This differs from std::vector<> not just in memory allocation,
+ * but also in terms of when members are constructed, destroyed,
+ * and assigned instead of being copy constructed.
+ *
+ * \param T type of element contained in the vector.
+ *
+ * \param N maximum size of the vector.
+ */
+template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+ T data_[N];
+ unsigned int size_;
+
+public:
+ //! \brief Constructs an empty vector with no memory allocated.
+ vector() :
+ size_(static_cast<unsigned int>(0))
+ {}
+
+ //! \brief Destroys all of the vector's elements; the fixed in-place storage itself is not deallocated.
+ ~vector()
+ {
+ clear();
+ }
+
+ //! \brief Returns the number of elements currently contained.
+ unsigned int size(void) const
+ {
+ return size_;
+ }
+
+ /*! \brief Empties the vector of all elements.
+ * \note
+ * This does not deallocate memory but will invoke destructors
+ * on contained elements.
+ */
+ void clear()
+ {
+ while(!empty()) {
+ pop_back();
+ }
+ }
+
+ /*! \brief Appends an element after the last valid element.
+ * Calling this on a vector that has reached capacity will throw an
+ * exception if exceptions are enabled.
+ */
+ void push_back (const T& x)
+ {
+ if (size() < N) {
+ new (&data_[size_]) T(x);
+ size_++;
+ } else {
+ detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+ }
+ }
+
+ /*! \brief Removes the last valid element from the vector.
+ * Calling this on an empty vector will throw an exception
+ * if exceptions are enabled.
+ */
+ void pop_back(void)
+ {
+ if (size_ != 0) {
+ --size_;
+ data_[size_].~T();
+ } else {
+ detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+ }
+ }
+
+ /*! \brief Constructs with a value copied from another.
+ *
+ * \param vec the vector to copy.
+ */
+ vector(const vector<T, N>& vec) :
+ size_(vec.size_)
+ {
+ if (size_ != 0) {
+ assign(vec.begin(), vec.end());
+ }
+ }
+
+ /*! \brief Constructs with a specified number of initial elements.
+ *
+ * \param size number of initial elements.
+ *
+ * \param val value of initial elements.
+ */
+ vector(unsigned int size, const T& val = T()) :
+ size_(0)
+ {
+ for (unsigned int i = 0; i < size; i++) {
+ push_back(val);
+ }
+ }
+
+ /*! \brief Overwrites the current content with that copied from another
+ * instance.
+ *
+ * \param rhs vector to copy.
+ *
+ * \returns a reference to this.
+ */
+ vector<T, N>& operator=(const vector<T, N>& rhs)
+ {
+ if (this == &rhs) {
+ return *this;
+ }
+
+ if (rhs.size_ != 0) {
+ assign(rhs.begin(), rhs.end());
+ } else {
+ clear();
+ }
+
+ return *this;
+ }
+
+ /*! \brief Tests equality against another instance.
+ *
+ * \param vec the vector against which to compare.
+ */
+ bool operator==(vector<T,N> &vec)
+ {
+ if (size() != vec.size()) {
+ return false;
+ }
+
+ for( unsigned int i = 0; i < size(); ++i ) {
+ if( operator[](i) != vec[i] ) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ //! \brief Conversion operator to T*.
+ operator T* () { return data_; }
+
+ //! \brief Conversion operator to const T*.
+ operator const T* () const { return data_; }
+
+ //! \brief Tests whether this instance has any elements.
+ bool empty (void) const
+ {
+ return size_==0;
+ }
+
+ //! \brief Returns the maximum number of elements this instance can hold.
+ unsigned int max_size (void) const
+ {
+ return N;
+ }
+
+ //! \brief Returns the maximum number of elements this instance can hold.
+ unsigned int capacity () const
+ {
+ return N;
+ }
+
+ /*! \brief Returns a reference to a given element.
+ *
+ * \param index which element to access.
+ *
+ * \note
+ * The caller is responsible for ensuring index is >= 0 and < size().
+ */
+ T& operator[](int index)
+ {
+ return data_[index];
+ }
+
+ /*! \brief Returns a const reference to a given element.
+ *
+ * \param index which element to access.
+ *
+ * \note
+ * The caller is responsible for ensuring index is >= 0 and < size().
+ */
+ const T& operator[](int index) const
+ {
+ return data_[index];
+ }
+
+ /*! \brief Assigns elements of the vector based on a source iterator range.
+ *
+ * \param start Beginning iterator of source range
+ * \param end End iterator of source range
+ *
+ * \note
+ * Will throw an exception if exceptions are enabled and size exceeded.
+ */
+ template<class I>
+ void assign(I start, I end)
+ {
+ clear();
+ while(start != end) {
+ push_back(*start);
+ start++;
+ }
+ }
+
+ /*! \class iterator
+ * \brief Const iterator class for vectors
+ */
+ class iterator
+ {
+ private:
+ const vector<T,N> *vec_;
+ int index_;
+
+ /**
+ * Internal iterator constructor to capture reference
+ * to the vector it iterates over rather than taking
+ * the vector by copy.
+ */
+ iterator (const vector<T,N> &vec, int index) :
+ vec_(&vec)
+ {
+ if( !vec.empty() ) {
+ index_ = index;
+ } else {
+ index_ = -1;
+ }
+ }
+
+ public:
+ iterator(void) :
+ vec_(NULL),
+ index_(-1)
+ {
+ }
+
+ iterator(const iterator& rhs) :
+ vec_(rhs.vec_),
+ index_(rhs.index_)
+ {
+ }
+
+ ~iterator(void) {}
+
+ static iterator begin(const cl::vector<T,N> &vec)
+ {
+ iterator i(vec, 0);
+
+ return i;
+ }
+
+ static iterator end(const cl::vector<T,N> &vec)
+ {
+ iterator i(vec, vec.size());
+
+ return i;
+ }
+
+ bool operator==(iterator i)
+ {
+ return ((vec_ == i.vec_) &&
+ (index_ == i.index_));
+ }
+
+ bool operator!=(iterator i)
+ {
+ return (!(*this==i));
+ }
+
+ iterator& operator++()
+ {
+ ++index_;
+ return *this;
+ }
+
+ iterator operator++(int)
+ {
+ iterator retVal(*this);
+ ++index_;
+ return retVal;
+ }
+
+ iterator& operator--()
+ {
+ --index_;
+ return *this;
+ }
+
+ iterator operator--(int)
+ {
+ iterator retVal(*this);
+ --index_;
+ return retVal;
+ }
+
+ const T& operator *() const
+ {
+ return (*vec_)[index_];
+ }
+ };
+
+ iterator begin(void)
+ {
+ return iterator::begin(*this);
+ }
+
+ iterator begin(void) const
+ {
+ return iterator::begin(*this);
+ }
+
+ iterator end(void)
+ {
+ return iterator::end(*this);
+ }
+
+ iterator end(void) const
+ {
+ return iterator::end(*this);
+ }
+
+ T& front(void)
+ {
+ return data_[0];
+ }
+
+ T& back(void)
+ {
+ return data_[size_ - 1];
+ }
+
+ const T& front(void) const
+ {
+ return data_[0];
+ }
+
+ const T& back(void) const
+ {
+ return data_[size_-1];
+ }
+};
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
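+
+/* Illustrative sketch of the deprecated fixed-capacity cl::vector: capacity
+ * is the template parameter N, so push_back beyond N reports
+ * CL_MEM_OBJECT_ALLOCATION_FAILURE through detail::errHandler instead of
+ * growing the storage.
+ *
+ * \code
+ * cl::vector<int, 4> v;
+ * v.push_back(1);
+ * v.push_back(2);
+ * for (cl::vector<int, 4>::iterator it = v.begin(); it != v.end(); ++it) {
+ *     int value = *it; // elements are visited in insertion order
+ * }
+ * \endcode
+ */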
+
+
+
+
+
+namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+ /*
+ * Compare and exchange primitives are needed for handling of defaults
+ */
+ inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+ {
+#ifdef _WIN32
+ return (int)(InterlockedCompareExchange(
+ (volatile long*)dest,
+ (long)exchange,
+ (long)comparand));
+#elif defined(__APPLE__) || defined(__MACOSX)
+ // OSAtomicCompareAndSwap32 returns a bool, so emulate the "return the
+ // previous value" contract of the other branches.
+ return OSAtomicCompareAndSwap32((int32_t)comparand, (int32_t)exchange, (volatile int32_t*)dest) ? comparand : *dest;
+#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
+ return (__sync_val_compare_and_swap(
+ dest,
+ comparand,
+ exchange));
+#endif // !_WIN32
+ }
+
+ inline void fence() { _mm_mfence(); }
+} // namespace detail
+
+
+/*! \brief class used to interface between C++ and
+ * OpenCL C calls that require arrays of size_t values, whose
+ * size is known statically.
+ */
+template <int N>
+class size_t
+{
+private:
+ ::size_t data_[N];
+
+public:
+ //! \brief Initialize size_t to all 0s
+ size_t()
+ {
+ for( int i = 0; i < N; ++i ) {
+ data_[i] = 0;
+ }
+ }
+
+ ::size_t& operator[](int index)
+ {
+ return data_[index];
+ }
+
+ const ::size_t& operator[](int index) const
+ {
+ return data_[index];
+ }
+
+ //! \brief Conversion operator to T*.
+ operator ::size_t* () { return data_; }
+
+ //! \brief Conversion operator to const T*.
+ operator const ::size_t* () const { return data_; }
+};
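+
+/* Illustrative sketch: cl::size_t<N> zero-initializes its elements and
+ * converts implicitly to ::size_t*, which is the form the C API expects for
+ * size_t[3] origin/region arguments.
+ *
+ * \code
+ * cl::size_t<3> origin;            // {0, 0, 0}
+ * cl::size_t<3> region;
+ * region[0] = 640; region[1] = 480; region[2] = 1;
+ * // origin and region decay to ::size_t* when passed to the C API.
+ * \endcode
+ */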
+
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: the actual argument passed is the int literal 0, so an
+// overload that declares the parameter as an int is an exact match and is
+// preferred over this generic version, which declares it as a long.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+ return f(name, sizeof(T), param, NULL);
+}
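+
+// Illustrative sketch of the int/long dispatch used by getInfoHelper:
+// callers pass the literal 0 (an int), so an overload taking int is an
+// exact match and wins, while SFINAE on T::cl_type keeps that overload out
+// of consideration for plain types:
+//
+//     template <typename T> int pick(T*, long) { return 0; } // generic fallback
+//     template <typename T> int pick(T*, int,
+//         typename T::cl_type = 0) { return 1; }             // wrapped types only
+//     // pick(ptr, 0) yields 1 when T::cl_type exists, 0 otherwise.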
+
+// Specialized getInfoHelper for VECTOR_CLASS params
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ T* value = (T*) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ param->assign(&value[0], &value[required/sizeof(T)]);
+ return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ ::size_t elements = required / sizeof(typename T::cl_type);
+ param->assign(&value[0], &value[elements]);
+ for (::size_t i = 0; i < elements; i++)
+ {
+ if (value[i] != NULL)
+ {
+ err = (*param)[i].retain();
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+ }
+ return CL_SUCCESS;
+}
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
+{
+ cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for STRING_CLASS params
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ char* value = (char*) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ *param = value;
+ return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for cl::size_t params
+template <typename Func, ::size_t N>
+inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+{
+ ::size_t required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ ::size_t* value = (::size_t*) alloca(required);
+ err = f(name, required, value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ for(int i = 0; i < N; ++i) {
+ (*param)[i] = value[i];
+ }
+
+ return CL_SUCCESS;
+}
+
+template<typename T> struct ReferenceHandler;
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+ typename T::cl_type value;
+ cl_int err = f(name, sizeof(value), &value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ *param = value;
+ if (value != NULL)
+ {
+ err = param->retain();
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+ return CL_SUCCESS;
+}
+
+#define __PARAM_NAME_INFO_1_0(F) \
+ F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+ F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+ \
+ F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+ F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+ F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+ F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+ F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+ F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+ F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
+ F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+ F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
+ F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
+ F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
+ \
+ F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
+ F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
+ F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
+ \
+ F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
+ F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
+ F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
+ F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \
+ \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
+ \
+ F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
+ F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
+ F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
+ F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
+ F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
+ F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
+ F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
+ \
+ F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
+ F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
+ F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
+ F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
+ F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
+ F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
+ F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
+ \
+ F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
+ F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
+ F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
+ F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
+ F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
+ \
+ F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
+ F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
+ F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
+ F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
+ F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
+ F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
+ F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
+ \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
+ \
+ F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
+ F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
+ F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
+ F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
+ F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
+ \
+ F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
+ F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
+ F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
+ \
+ F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
+ F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
+ F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
+ F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#if defined(CL_VERSION_1_1)
+#define __PARAM_NAME_INFO_1_1(F) \
+ F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
+ F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \
+ \
+ F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
+ F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
+ \
+ F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
+ F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
+ \
+ F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
+#endif // CL_VERSION_1_1
+
+
+#if defined(CL_VERSION_1_2)
+#define __PARAM_NAME_INFO_1_2(F) \
+ F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
+ \
+ F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
+ F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
+ \
+ F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
+ \
+ F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
+ \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
+ \
+ F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
+ F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
+ F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>) \
+ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \
+ F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
+ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if defined(USE_CL_DEVICE_FISSION)
+#define __PARAM_NAME_DEVICE_FISSION(F) \
+ F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+ F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+ F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
+#endif // USE_CL_DEVICE_FISSION
+
+template <typename enum_type, cl_int Name>
+struct param_traits {};
+
+#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
+struct token; \
+template<> \
+struct param_traits<detail:: token,param_name> \
+{ \
+ enum { value = param_name }; \
+ typedef T param_type; \
+};
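+
+// Illustrative expansion of one row of the X-macro tables above: for the
+// (cl_device_info, CL_DEVICE_NAME, STRING_CLASS) entry,
+// __CL_DECLARE_PARAM_TRAITS produces
+//
+//     struct cl_device_info;
+//     template<>
+//     struct param_traits<detail::cl_device_info, CL_DEVICE_NAME>
+//     {
+//         enum { value = CL_DEVICE_NAME };
+//         typedef STRING_CLASS param_type;
+//     };
+//
+// which is what lets getInfo<CL_DEVICE_NAME>() deduce its return type at
+// compile time.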
+
+__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS)
+#if defined(CL_VERSION_1_1)
+__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_1
+#if defined(CL_VERSION_1_2)
+__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_2
+
+#if defined(USE_CL_DEVICE_FISSION)
+__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS);
+#endif // USE_CL_DEVICE_FISSION
+
+#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
+__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS)
+#endif
+
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
+#endif
+
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_WARP_SIZE_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_GPU_OVERLAP_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
+#endif
+
+// Convenience functions
+
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+ return getInfoHelper(f, name, param, 0);
+}
+
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+ Func f_; const Arg0& arg0_;
+ cl_int operator ()(
+ cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+ { return f_(arg0_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+ Func f_; const Arg0& arg0_; const Arg1& arg1_;
+ cl_int operator ()(
+ cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+ { return f_(arg0_, arg1_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+ GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
+ return getInfoHelper(f0, name, param, 0);
+}
+
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+ GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
+ return getInfoHelper(f0, name, param, 0);
+}
+
+template<typename T>
+struct ReferenceHandler
+{ };
+
+#if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.2 devices do have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+ /**
+ * Retain the device.
+ * \param device A valid device created using createSubDevices
+ * \return
+ * CL_SUCCESS if the function executed successfully.
+ * CL_INVALID_DEVICE if device was not a valid subdevice
+ * CL_OUT_OF_RESOURCES
+ * CL_OUT_OF_HOST_MEMORY
+ */
+ static cl_int retain(cl_device_id device)
+ { return ::clRetainDevice(device); }
+ /**
+ * Release the device.
+ * \param device A valid device created using createSubDevices
+ * \return
+ * CL_SUCCESS if the function executed successfully.
+ * CL_INVALID_DEVICE if device was not a valid subdevice
+ * CL_OUT_OF_RESOURCES
+ * CL_OUT_OF_HOST_MEMORY
+ */
+ static cl_int release(cl_device_id device)
+ { return ::clReleaseDevice(device); }
+};
+#else // #if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.1 devices do not have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+ // cl_device_id does not have retain().
+ static cl_int retain(cl_device_id)
+ { return CL_SUCCESS; }
+ // cl_device_id does not have release().
+ static cl_int release(cl_device_id)
+ { return CL_SUCCESS; }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+ // cl_platform_id does not have retain().
+ static cl_int retain(cl_platform_id)
+ { return CL_SUCCESS; }
+ // cl_platform_id does not have release().
+ static cl_int release(cl_platform_id)
+ { return CL_SUCCESS; }
+};
+
+template <>
+struct ReferenceHandler<cl_context>
+{
+ static cl_int retain(cl_context context)
+ { return ::clRetainContext(context); }
+ static cl_int release(cl_context context)
+ { return ::clReleaseContext(context); }
+};
+
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+ static cl_int retain(cl_command_queue queue)
+ { return ::clRetainCommandQueue(queue); }
+ static cl_int release(cl_command_queue queue)
+ { return ::clReleaseCommandQueue(queue); }
+};
+
+template <>
+struct ReferenceHandler<cl_mem>
+{
+ static cl_int retain(cl_mem memory)
+ { return ::clRetainMemObject(memory); }
+ static cl_int release(cl_mem memory)
+ { return ::clReleaseMemObject(memory); }
+};
+
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+ static cl_int retain(cl_sampler sampler)
+ { return ::clRetainSampler(sampler); }
+ static cl_int release(cl_sampler sampler)
+ { return ::clReleaseSampler(sampler); }
+};
+
+template <>
+struct ReferenceHandler<cl_program>
+{
+ static cl_int retain(cl_program program)
+ { return ::clRetainProgram(program); }
+ static cl_int release(cl_program program)
+ { return ::clReleaseProgram(program); }
+};
+
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+ static cl_int retain(cl_kernel kernel)
+ { return ::clRetainKernel(kernel); }
+ static cl_int release(cl_kernel kernel)
+ { return ::clReleaseKernel(kernel); }
+};
+
+template <>
+struct ReferenceHandler<cl_event>
+{
+ static cl_int retain(cl_event event)
+ { return ::clRetainEvent(event); }
+ static cl_int release(cl_event event)
+ { return ::clReleaseEvent(event); }
+};
+
+
+// Extracts the version number with the major in the upper 16 bits and the
+// minor in the lower 16 bits. Assumes the mandated version string format
+// "OpenCL <major>.<minor> <platform-specific information>", so parsing
+// starts just past the 7-character "OpenCL " prefix.
+static cl_uint getVersion(const char *versionInfo)
+{
+ int highVersion = 0;
+ int lowVersion = 0;
+ int index = 7;
+ while(versionInfo[index] != '.' ) {
+ highVersion *= 10;
+ highVersion += versionInfo[index]-'0';
+ ++index;
+ }
+ ++index;
+ while(versionInfo[index] != ' ' ) {
+ lowVersion *= 10;
+ lowVersion += versionInfo[index]-'0';
+ ++index;
+ }
+ return (highVersion << 16) | lowVersion;
+}
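+
+// Illustrative values, assuming the mandated "OpenCL <major>.<minor> ..."
+// version string format:
+//
+//     getVersion("OpenCL 1.2 FULL_PROFILE") == (1 << 16) | 2
+//     getVersion("OpenCL 1.1 SomeVendor")   == (1 << 16) | 1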
+
+static cl_uint getPlatformVersion(cl_platform_id platform)
+{
+ ::size_t size = 0;
+ clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+ char *versionInfo = (char *) alloca(size);
+ clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
+ return getVersion(versionInfo);
+}
+
+static cl_uint getDevicePlatformVersion(cl_device_id device)
+{
+ cl_platform_id platform;
+ clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+ return getPlatformVersion(platform);
+}
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+static cl_uint getContextPlatformVersion(cl_context context)
+{
+ // The platform cannot be queried directly from a context, so first grab
+ // a device belonging to the context and look up that device's platform.
+ ::size_t size = 0;
+ clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+ if (size == 0)
+ return 0;
+ cl_device_id *devices = (cl_device_id *) alloca(size);
+ clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
+ return getDevicePlatformVersion(devices[0]);
+}
+#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+template <typename T>
+class Wrapper
+{
+public:
+ typedef T cl_type;
+
+protected:
+ cl_type object_;
+
+public:
+ Wrapper() : object_(NULL) { }
+
+ Wrapper(const cl_type &obj) : object_(obj) { }
+
+ ~Wrapper()
+ {
+ if (object_ != NULL) { release(); }
+ }
+
+ Wrapper(const Wrapper<cl_type>& rhs)
+ {
+ object_ = rhs.object_;
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ }
+
+ Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs.object_;
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (const cl_type &rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs;
+ return *this;
+ }
+
+ cl_type operator ()() const { return object_; }
+
+ cl_type& operator ()() { return object_; }
+
+protected:
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+ cl_int retain() const
+ {
+ return ReferenceHandler<cl_type>::retain(object_);
+ }
+
+ cl_int release() const
+ {
+ return ReferenceHandler<cl_type>::release(object_);
+ }
+};
+
+template <>
+class Wrapper<cl_device_id>
+{
+public:
+ typedef cl_device_id cl_type;
+
+protected:
+ cl_type object_;
+ bool referenceCountable_;
+
+ static bool isReferenceCountable(cl_device_id device)
+ {
+ bool retVal = false;
+ if (device != NULL) {
+ int version = getDevicePlatformVersion(device);
+ if(version > ((1 << 16) + 1)) {
+ retVal = true;
+ }
+ }
+ return retVal;
+ }
+
+public:
+ Wrapper() : object_(NULL), referenceCountable_(false)
+ {
+ }
+
+ Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false)
+ {
+ referenceCountable_ = isReferenceCountable(obj);
+ }
+
+ ~Wrapper()
+ {
+ if (object_ != NULL) { release(); }
+ }
+
+ Wrapper(const Wrapper<cl_type>& rhs)
+ {
+ object_ = rhs.object_;
+ referenceCountable_ = isReferenceCountable(object_);
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ }
+
+ Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs.object_;
+ referenceCountable_ = rhs.referenceCountable_;
+ if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (const cl_type &rhs)
+ {
+ if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+ object_ = rhs;
+ referenceCountable_ = isReferenceCountable(object_);
+ return *this;
+ }
+
+ cl_type operator ()() const { return object_; }
+
+ cl_type& operator ()() { return object_; }
+
+protected:
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
+
+ cl_int retain() const
+ {
+ if( referenceCountable_ ) {
+ return ReferenceHandler<cl_type>::retain(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+
+ cl_int release() const
+ {
+ if( referenceCountable_ ) {
+ return ReferenceHandler<cl_type>::release(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+};
+
+} // namespace detail
+//! \endcond
+
+/*! \struct ImageFormat
+ * \brief Adds constructors and member functions for cl_image_format.
+ *
+ * \see cl_image_format
+ */
+struct ImageFormat : public cl_image_format
+{
+ //! \brief Default constructor - performs no initialization.
+ ImageFormat(){}
+
+ //! \brief Initializing constructor.
+ ImageFormat(cl_channel_order order, cl_channel_type type)
+ {
+ image_channel_order = order;
+ image_channel_data_type = type;
+ }
+
+ //! \brief Assignment operator.
+ ImageFormat& operator = (const ImageFormat& rhs)
+ {
+ if (this != &rhs) {
+ this->image_channel_data_type = rhs.image_channel_data_type;
+ this->image_channel_order = rhs.image_channel_order;
+ }
+ return *this;
+ }
+};
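+
+// Example (an illustrative sketch, not part of the original header): an
+// RGBA format with 8-bit unsigned normalized channels:
+//
+//     cl::ImageFormat format(CL_RGBA, CL_UNORM_INT8);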
+
+/*! \brief Class interface for cl_device_id.
+ *
+ * \note Copies of these objects are inexpensive, since they don't 'own'
+ * any underlying resources or data structures.
+ *
+ * \see cl_device_id
+ */
+class Device : public detail::Wrapper<cl_device_id>
+{
+public:
+ //! \brief Default constructor - initializes to NULL.
+ Device() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
+
+ /*! \brief Constructor from cl_device_id.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
+
+ /*! \brief Returns the first device on the default context.
+ *
+ * \see Context::getDefault()
+ */
+ static Device getDefault(cl_int * err = NULL);
+
+ /*! \brief Assignment operator from Device.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device& operator = (const Device& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_device_id.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device& operator = (const cl_device_id& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetDeviceInfo().
+ template <typename T>
+ cl_int getInfo(cl_device_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetDeviceInfo, object_, name, param),
+ __GET_DEVICE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetDeviceInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_device_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_device_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /**
+ * CL 1.2 version
+ */
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clCreateSubDevices().
+ cl_int createSubDevices(
+ const cl_device_partition_property * properties,
+ VECTOR_CLASS<Device>* devices)
+ {
+ cl_uint n = 0;
+ cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = clCreateSubDevices(object_, properties, n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * CL 1.1 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_1)
+#if defined(USE_CL_DEVICE_FISSION)
+ cl_int createSubDevices(
+ const cl_device_partition_property_ext * properties,
+ VECTOR_CLASS<Device>* devices)
+ {
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+ cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+ __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+
+ cl_uint n = 0;
+ cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+#endif // #if defined(USE_CL_DEVICE_FISSION)
+#endif // #if defined(CL_VERSION_1_1)
+};
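+
+// Example (an illustrative sketch): querying device properties through the
+// typed getInfo wrapper, assuming a usable default device and that
+// STRING_CLASS is std::string:
+//
+//     cl::Device device = cl::Device::getDefault();
+//     std::string name  = device.getInfo<CL_DEVICE_NAME>();
+//     cl_uint units     = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();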
+
+/*! \brief Class interface for cl_platform_id.
+ *
+ * \note Copies of these objects are inexpensive, since they don't 'own'
+ * any underlying resources or data structures.
+ *
+ * \see cl_platform_id
+ */
+class Platform : public detail::Wrapper<cl_platform_id>
+{
+public:
+ //! \brief Default constructor - initializes to NULL.
+ Platform() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
+
+ /*! \brief Constructor from cl_platform_id.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
+
+ /*! \brief Assignment operator from Platform.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform& operator = (const Platform& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_platform_id.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform& operator = (const cl_platform_id& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetPlatformInfo().
+ cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetPlatformInfo, object_, name, param),
+ __GET_PLATFORM_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetPlatformInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_platform_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_platform_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Gets a list of devices for this platform.
+ *
+ * Wraps clGetDeviceIDs().
+ */
+ cl_int getDevices(
+ cl_device_type type,
+ VECTOR_CLASS<Device>* devices) const
+ {
+ cl_uint n = 0;
+ if( devices == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+ }
+ cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = ::clGetDeviceIDs(object_, type, n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+
+#if defined(USE_DX_INTEROP)
+ /*! \brief Get the list of available D3D10 devices.
+ *
+     * \param d3d_device_source specifies the type of object from which
+     * OpenCL devices are requested (for example CL_D3D10_DEVICE_KHR or
+     * CL_D3D10_DXGI_ADAPTER_KHR).
+     *
+     * \param d3d_object the Direct3D 10 object (device or DXGI adapter)
+     * identified by \a d3d_device_source.
+     *
+     * \param d3d_device_set the set of devices to return, either
+     * CL_PREFERRED_DEVICES_FOR_D3D10_KHR or CL_ALL_DEVICES_FOR_D3D10_KHR.
+ *
+ * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
+ * values returned in devices can be used to identify a specific OpenCL
+ * device. If \a devices argument is NULL, this argument is ignored.
+ *
+     * \return CL_SUCCESS if the function executed successfully, otherwise
+     * an OpenCL error code.
+ *
+     * The application can query specific capabilities of the OpenCL device(s)
+     * returned by this call and use them to determine which device(s) to use.
+ *
+ * \note In the case that exceptions are enabled and a return value
+ * other than CL_SUCCESS is generated, then cl::Error exception is
+ * generated.
+ */
+ cl_int getDevices(
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ VECTOR_CLASS<Device>* devices) const
+ {
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint* num_devices);
+
+ if( devices == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+ }
+
+ static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+ __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
+
+ cl_uint n = 0;
+ cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
+ object_,
+ d3d_device_source,
+ d3d_object,
+ d3d_device_set,
+ 0,
+ NULL,
+ &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+ err = pfn_clGetDeviceIDsFromD3D10KHR(
+ object_,
+ d3d_device_source,
+ d3d_object,
+ d3d_device_set,
+ n,
+ ids,
+ NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ devices->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+#endif
+
+ /*! \brief Gets a list of available platforms.
+ *
+ * Wraps clGetPlatformIDs().
+ */
+ static cl_int get(
+ VECTOR_CLASS<Platform>* platforms)
+ {
+ cl_uint n = 0;
+
+ if( platforms == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_platform_id* ids = (cl_platform_id*) alloca(
+ n * sizeof(cl_platform_id));
+ err = ::clGetPlatformIDs(n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ platforms->assign(&ids[0], &ids[n]);
+ return CL_SUCCESS;
+ }
+
+ /*! \brief Gets the first available platform.
+ *
+ * Wraps clGetPlatformIDs(), returning the first result.
+ */
+ static cl_int get(
+ Platform * platform)
+ {
+ cl_uint n = 0;
+
+ if( platform == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_platform_id* ids = (cl_platform_id*) alloca(
+ n * sizeof(cl_platform_id));
+ err = ::clGetPlatformIDs(n, ids, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ *platform = ids[0];
+ return CL_SUCCESS;
+ }
+
+ /*! \brief Gets the first available platform, returning it by value.
+ *
+ * Wraps clGetPlatformIDs(), returning the first result.
+ */
+ static Platform get(
+ cl_int * errResult = NULL)
+ {
+ Platform platform;
+ cl_uint n = 0;
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ if (errResult != NULL) {
+ *errResult = err;
+ }
+ }
+
+ cl_platform_id* ids = (cl_platform_id*) alloca(
+ n * sizeof(cl_platform_id));
+ err = ::clGetPlatformIDs(n, ids, NULL);
+
+ if (err != CL_SUCCESS) {
+ detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ if (errResult != NULL) {
+ *errResult = err;
+ }
+
+ return ids[0];
+ }
+
+ static Platform getDefault(
+ cl_int *errResult = NULL )
+ {
+ return get(errResult);
+ }
+
+
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clUnloadPlatformCompiler().
+ cl_int
+ unloadCompiler()
+ {
+ return ::clUnloadPlatformCompiler(object_);
+ }
+#endif // #if defined(CL_VERSION_1_2)
+}; // class Platform
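+
+// Example (an illustrative sketch): enumerating platforms and the GPU
+// devices each one exposes:
+//
+//     VECTOR_CLASS<cl::Platform> platforms;
+//     if (cl::Platform::get(&platforms) == CL_SUCCESS) {
+//         for (::size_t i = 0; i < platforms.size(); i++) {
+//             VECTOR_CLASS<cl::Device> devices;
+//             platforms[i].getDevices(CL_DEVICE_TYPE_GPU, &devices);
+//         }
+//     }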
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+/**
+ * Unload the OpenCL compiler.
+ * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline cl_int
+UnloadCompiler()
+{
+ return ::clUnloadCompiler();
+}
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+
+/*! \brief Class interface for cl_context.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_context as the original. For details, see
+ * clRetainContext() and clReleaseContext().
+ *
+ * \see cl_context
+ */
+class Context
+ : public detail::Wrapper<cl_context>
+{
+private:
+ static volatile int default_initialized_;
+ static Context default_;
+ static volatile cl_int default_error_;
+public:
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseContext() on the value held by this instance.
+ */
+ ~Context() { }
+
+ /*! \brief Constructs a context including a list of specified devices.
+ *
+ * Wraps clCreateContext().
+ */
+ Context(
+ const VECTOR_CLASS<Device>& devices,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ ::size_t,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ object_ = ::clCreateContext(
+ properties, (cl_uint) numDevices,
+ deviceIDs,
+ notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Context(
+ const Device& device,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ ::size_t,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ cl_device_id deviceID = device();
+
+ object_ = ::clCreateContext(
+ properties, 1,
+ &deviceID,
+ notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructs a context including all or a subset of devices of a specified type.
+ *
+ * Wraps clCreateContextFromType().
+ */
+ Context(
+ cl_device_type type,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ ::size_t,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+#if !defined(__APPLE__) || !defined(__MACOS)
+ cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
+
+ if (properties == NULL) {
+ // Get a valid platform ID as we cannot send in a blank one
+ VECTOR_CLASS<Platform> platforms;
+ error = Platform::get(&platforms);
+ if (error != CL_SUCCESS) {
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ // Check the platforms we found for a device of our specified type
+ cl_context_properties platform_id = 0;
+ for (unsigned int i = 0; i < platforms.size(); i++) {
+
+ VECTOR_CLASS<Device> devices;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+ try {
+#endif
+
+ error = platforms[i].getDevices(type, &devices);
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+ } catch (Error) {}
+ // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type
+ // We do error checking next anyway, and can throw there if needed
+#endif
+
+ // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
+ if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ if (devices.size() > 0) {
+ platform_id = (cl_context_properties)platforms[i]();
+ break;
+ }
+ }
+
+ if (platform_id == 0) {
+ detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = CL_DEVICE_NOT_FOUND;
+ }
+ return;
+ }
+
+ prop[1] = platform_id;
+ properties = &prop[0];
+ }
+#endif
+ object_ = ::clCreateContextFromType(
+ properties, type, notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
+ *
+ * \note All calls to this function return the same cl_context as the first.
+ */
+ static Context getDefault(cl_int * err = NULL)
+ {
+ int state = detail::compare_exchange(
+ &default_initialized_,
+ __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+
+ if (state & __DEFAULT_INITIALIZED) {
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ if (state & __DEFAULT_BEING_INITIALIZED) {
+ // Assume writes will propagate eventually...
+ while(default_initialized_ != __DEFAULT_INITIALIZED) {
+ detail::fence();
+ }
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ cl_int error;
+ default_ = Context(
+ CL_DEVICE_TYPE_DEFAULT,
+ NULL,
+ NULL,
+ NULL,
+ &error);
+
+ detail::fence();
+
+ default_error_ = error;
+ // Assume writes will propagate eventually...
+ default_initialized_ = __DEFAULT_INITIALIZED;
+
+ detail::fence();
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Context() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This calls clRetainContext() on the parameter's cl_context.
+ */
+ Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
+
+ /*! \brief Constructor from cl_context - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_context
+ * into the new Context object.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
+
+ /*! \brief Assignment operator from Context.
+ *
+ * This calls clRetainContext() on the parameter and clReleaseContext() on
+ * the previous value held by this instance.
+ */
+ Context& operator = (const Context& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_context - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseContext() on the value previously held by this instance.
+ */
+ Context& operator = (const cl_context& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetContextInfo().
+ template <typename T>
+ cl_int getInfo(cl_context_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetContextInfo, object_, name, param),
+ __GET_CONTEXT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetContextInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_context_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_context_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Gets a list of supported image formats.
+ *
+ * Wraps clGetSupportedImageFormats().
+ */
+ cl_int getSupportedImageFormats(
+ cl_mem_flags flags,
+ cl_mem_object_type type,
+ VECTOR_CLASS<ImageFormat>* formats) const
+ {
+ cl_uint numEntries;
+ cl_int err = ::clGetSupportedImageFormats(
+ object_,
+ flags,
+ type,
+ 0,
+ NULL,
+ &numEntries);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+ }
+
+ ImageFormat* value = (ImageFormat*)
+ alloca(numEntries * sizeof(ImageFormat));
+ err = ::clGetSupportedImageFormats(
+ object_,
+ flags,
+ type,
+ numEntries,
+ (cl_image_format*) value,
+ NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+ }
+
+ formats->assign(&value[0], &value[numEntries]);
+ return CL_SUCCESS;
+ }
+};
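+
+// Example (an illustrative sketch): creating a context for any GPU device
+// and listing the 2D image formats it supports for read-only images:
+//
+//     cl_int err;
+//     cl::Context context(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err);
+//     VECTOR_CLASS<cl::ImageFormat> formats;
+//     if (err == CL_SUCCESS) {
+//         context.getSupportedImageFormats(
+//             CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &formats);
+//     }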
+
+inline Device Device::getDefault(cl_int * err)
+{
+ cl_int error;
+ Device device;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+ if (err != NULL) {
+ *err = CL_SUCCESS;
+ }
+ }
+
+ return device;
+}
+
+
+#ifdef _WIN32
+__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) Context Context::default_;
+__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) Context Context::default_;
+__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#endif
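+
+// Note: the selectany/weak linkage above gives these static members a single
+// definition even when this header is included from several translation
+// units, which is what allows the default-context singleton to live in a
+// header-only library.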
+
+/*! \brief Class interface for cl_event.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_event as the original. For details, see
+ * clRetainEvent() and clReleaseEvent().
+ *
+ * \see cl_event
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseEvent() on the value held by this instance.
+ */
+ ~Event() { }
+
+ //! \brief Default constructor - initializes to NULL.
+ Event() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor.
+ *
+ * This calls clRetainEvent() on the parameter's cl_event.
+ */
+ Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
+
+ /*! \brief Constructor from cl_event - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_event
+ * into the new Event object.
+ */
+ Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
+
+    /*! \brief Assignment operator from Event.
+     *
+     * This calls clRetainEvent() on the parameter and clReleaseEvent() on
+     * the previous value held by this instance.
+     */
+ Event& operator = (const Event& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+    /*! \brief Assignment operator from cl_event - takes ownership.
+     *
+     * This effectively transfers ownership of a refcount on the rhs and calls
+     * clReleaseEvent() on the value previously held by this instance.
+     */
+ Event& operator = (const cl_event& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetEventInfo().
+ template <typename T>
+ cl_int getInfo(cl_event_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetEventInfo, object_, name, param),
+ __GET_EVENT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetEventInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_event_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_event_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ //! \brief Wrapper for clGetEventProfilingInfo().
+ template <typename T>
+ cl_int getProfilingInfo(cl_profiling_info name, T* param) const
+ {
+ return detail::errHandler(detail::getInfo(
+ &::clGetEventProfilingInfo, object_, name, param),
+ __GET_EVENT_PROFILE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_profiling_info, name>::param_type
+ getProfilingInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_profiling_info, name>::param_type param;
+ cl_int result = getProfilingInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Blocks the calling thread until this event completes.
+ *
+ * Wraps clWaitForEvents().
+ */
+ cl_int wait() const
+ {
+ return detail::errHandler(
+ ::clWaitForEvents(1, &object_),
+ __WAIT_FOR_EVENTS_ERR);
+ }
+
+#if defined(CL_VERSION_1_1)
+ /*! \brief Registers a user callback function for a specific command execution status.
+ *
+ * Wraps clSetEventCallback().
+ */
+ cl_int setCallback(
+ cl_int type,
+ void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+ void * user_data = NULL)
+ {
+ return detail::errHandler(
+ ::clSetEventCallback(
+ object_,
+ type,
+ pfn_notify,
+ user_data),
+ __SET_EVENT_CALLBACK_ERR);
+ }
+#endif
+
+ /*! \brief Blocks the calling thread until every event specified is complete.
+ *
+ * Wraps clWaitForEvents().
+ */
+ static cl_int
+ waitForEvents(const VECTOR_CLASS<Event>& events)
+ {
+ return detail::errHandler(
+ ::clWaitForEvents(
+ (cl_uint) events.size(), (cl_event*)&events.front()),
+ __WAIT_FOR_EVENTS_ERR);
+ }
+};
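+
+// Example (an illustrative sketch): waiting on an event returned by an
+// enqueue call and reading its timestamps; the profiling values are only
+// valid if the queue was created with CL_QUEUE_PROFILING_ENABLE:
+//
+//     cl::Event event;
+//     // ... pass &event to an enqueue call on a cl::CommandQueue ...
+//     event.wait();
+//     cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+//     cl_ulong end   = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();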
+
+#if defined(CL_VERSION_1_1)
+/*! \brief Class interface for user events (a special case of cl_event).
+ *
+ * See Event for details about copy semantics, etc.
+ */
+class UserEvent : public Event
+{
+public:
+ /*! \brief Constructs a user event on a given context.
+ *
+ * Wraps clCreateUserEvent().
+ */
+ UserEvent(
+ const Context& context,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateUserEvent(
+ context(),
+ &error);
+
+ detail::errHandler(error, __CREATE_USER_EVENT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ UserEvent() : Event() { }
+
+ //! \brief Copy constructor - performs shallow copy.
+ UserEvent(const UserEvent& event) : Event(event) { }
+
+    //! \brief Assignment operator - performs shallow copy.
+ UserEvent& operator = (const UserEvent& rhs)
+ {
+ if (this != &rhs) {
+ Event::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Sets the execution status of a user event object.
+ *
+ * Wraps clSetUserEventStatus().
+ */
+ cl_int setStatus(cl_int status)
+ {
+ return detail::errHandler(
+ ::clSetUserEventStatus(object_,status),
+ __SET_USER_EVENT_STATUS_ERR);
+ }
+};
+#endif
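+
+// Example (an illustrative sketch, OpenCL 1.1+): gating enqueued work on a
+// host-controlled event:
+//
+//     cl::UserEvent gate(context);
+//     // ... enqueue commands whose wait lists contain 'gate' ...
+//     gate.setStatus(CL_COMPLETE);  // lets the gated commands run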
+
+/*! \brief Blocks the calling thread until every event specified is complete.
+ *
+ * Wraps clWaitForEvents().
+ */
+inline static cl_int
+WaitForEvents(const VECTOR_CLASS<Event>& events)
+{
+ return detail::errHandler(
+ ::clWaitForEvents(
+ (cl_uint) events.size(), (cl_event*)&events.front()),
+ __WAIT_FOR_EVENTS_ERR);
+}
+
+/*! \brief Class interface for cl_mem.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_mem as the original. For details, see
+ * clRetainMemObject() and clReleaseMemObject().
+ *
+ * \see cl_mem
+ */
+class Memory : public detail::Wrapper<cl_mem>
+{
+public:
+
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseMemObject() on the value held by this instance.
+ */
+ ~Memory() {}
+
+ //! \brief Default constructor - initializes to NULL.
+ Memory() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * This calls clRetainMemObject() on the parameter's cl_mem.
+ */
+ Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_mem
+ * into the new Memory object.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
+
+ /*! \brief Assignment operator from Memory.
+ *
+ * This calls clRetainMemObject() on the parameter and clReleaseMemObject()
+ * on the previous value held by this instance.
+ */
+ Memory& operator = (const Memory& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_mem - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseMemObject() on the value previously held by this instance.
+ */
+ Memory& operator = (const cl_mem& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetMemObjectInfo().
+ template <typename T>
+ cl_int getInfo(cl_mem_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
+ __GET_MEM_OBJECT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_mem_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_mem_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+#if defined(CL_VERSION_1_1)
+ /*! \brief Registers a callback function to be called when the memory object
+ * is no longer needed.
+ *
+ * Wraps clSetMemObjectDestructorCallback().
+ *
+ * Repeated calls to this function, for a given cl_mem value, will append
+ * to the list of functions called (in reverse order) when memory object's
+ * resources are freed and the memory object is deleted.
+ *
+ * \note
+ * The registered callbacks are associated with the underlying cl_mem
+ * value - not the Memory class instance.
+ */
+ cl_int setDestructorCallback(
+ void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
+ void * user_data = NULL)
+ {
+ return detail::errHandler(
+ ::clSetMemObjectDestructorCallback(
+ object_,
+ pfn_notify,
+ user_data),
+ __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
+ }
+#endif
+
+};
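+
+// Example (an illustrative sketch, OpenCL 1.1+): registering a destructor
+// callback on a memory object; onMemDestroyed is a hypothetical
+// user-supplied function:
+//
+//     void CL_CALLBACK onMemDestroyed(cl_mem, void* user_data);
+//     // ...
+//     memory.setDestructorCallback(&onMemDestroyed, NULL);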
+
+// Pre-declare copy functions and the classes they reference
+class Buffer;
+class CommandQueue;
+template< typename IteratorType >
+cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+
+
+/*! \brief Class interface for Buffer Memory Objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Buffer : public Memory
+{
+public:
+
+ /*! \brief Constructs a Buffer in a specified context.
+ *
+ * Wraps clCreateBuffer().
+ *
+ * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+ * specified. Note alignment & exclusivity requirements.
+ */
+ Buffer(
+ const Context& context,
+ cl_mem_flags flags,
+ ::size_t size,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructs a Buffer in the default context.
+ *
+ * Wraps clCreateBuffer().
+ *
+ * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+ * specified. Note alignment & exclusivity requirements.
+ *
+ * \see Context::getDefault()
+ */
+ Buffer(
+ cl_mem_flags flags,
+ ::size_t size,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(err);
+
+ object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+    /*!
+     * \brief Construct a Buffer from a host container via iterators.
+     * IteratorType must be a random access iterator.
+     * If useHostPtr is specified, the iterators must refer to contiguous data.
+     */
+ template< typename IteratorType >
+ Buffer(
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr = false,
+ cl_int* err = NULL)
+ {
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if( readOnly ) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if( useHostPtr ) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+ Context context = Context::getDefault(err);
+
+ if( useHostPtr ) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ } else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if( !useHostPtr ) {
+ error = cl::copy(startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ }
+
+    /*!
+     * \brief Construct a Buffer from a host container via iterators using a specified context.
+     * IteratorType must be a random access iterator.
+     * If useHostPtr is specified, the iterators must refer to contiguous data.
+     */
+ template< typename IteratorType >
+ Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
+ bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+
+ //! \brief Default constructor - initializes to NULL.
+ Buffer() : Memory() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Buffer(const Buffer& buffer) : Memory(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
+
+ /*! \brief Assignment from Buffer - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Buffer& operator = (const Buffer& rhs)
+ {
+ if (this != &rhs) {
+ Memory::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Buffer& operator = (const cl_mem& rhs)
+ {
+ Memory::operator=(rhs);
+ return *this;
+ }
+
+#if defined(CL_VERSION_1_1)
+ /*! \brief Creates a new buffer object from this.
+ *
+ * Wraps clCreateSubBuffer().
+ */
+ Buffer createSubBuffer(
+ cl_mem_flags flags,
+ cl_buffer_create_type buffer_create_type,
+ const void * buffer_create_info,
+ cl_int * err = NULL)
+ {
+ Buffer result;
+ cl_int error;
+ result.object_ = ::clCreateSubBuffer(
+ object_,
+ flags,
+ buffer_create_type,
+ buffer_create_info,
+ &error);
+
+ detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ return result;
+ }
+#endif
+};
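+
+// Example (an illustrative sketch): creating a device buffer from a host
+// container via the iterator constructor; this path uses the default
+// context and queue, so it assumes both can be created:
+//
+//     std::vector<float> host(1024, 0.0f);
+//     cl_int err;
+//     cl::Buffer buffer(host.begin(), host.end(),
+//                       false /*readOnly*/, false /*useHostPtr*/, &err);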
+
+#if defined (USE_DX_INTEROP)
+/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
+ *
+ * This is provided to facilitate interoperability with Direct3D.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferD3D10 : public Buffer
+{
+public:
+ typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
+ cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer,
+ cl_int* errcode_ret);
+
+ /*! \brief Constructs a BufferD3D10, in a specified context, from a
+ * given ID3D10Buffer.
+ *
+ * Wraps clCreateFromD3D10BufferKHR().
+ */
+ BufferD3D10(
+ const Context& context,
+ cl_mem_flags flags,
+ ID3D10Buffer* bufobj,
+ cl_int * err = NULL)
+ {
+ static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
+
+#if defined(CL_VERSION_1_2)
+        // The context properties are a zero-terminated list of (name, value)
+        // pairs, so step through them two at a time.
+        VECTOR_CLASS<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
+        cl_platform_id platform = NULL;
+        for( ::size_t i = 0; i + 1 < props.size(); i += 2 ) {
+            if( props[i] == CL_CONTEXT_PLATFORM ) {
+                platform = (cl_platform_id) props[i+1];
+            }
+        }
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
+#elif defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
+#endif
+
+ cl_int error;
+ object_ = pfn_clCreateFromD3D10BufferKHR(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferD3D10() : Buffer() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
+
+ /*! \brief Assignment from BufferD3D10 - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferD3D10& operator = (const BufferD3D10& rhs)
+ {
+ if (this != &rhs) {
+ Buffer::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferD3D10& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+};
+#endif
+
+/*! \brief Class interface for GL Buffer Memory Objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferGL : public Buffer
+{
+public:
+ /*! \brief Constructs a BufferGL in a specified context, from a given
+ * GL buffer.
+ *
+ * Wraps clCreateFromGLBuffer().
+ */
+ BufferGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLuint bufobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLBuffer(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferGL() : Buffer() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+ /*! \brief Assignment from BufferGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferGL& operator = (const BufferGL& rhs)
+ {
+ if (this != &rhs) {
+ Buffer::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferGL& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetGLObjectInfo().
+ cl_int getObjectInfo(
+ cl_gl_object_type *type,
+ GLuint * gl_object_name)
+ {
+ return detail::errHandler(
+ ::clGetGLObjectInfo(object_,type,gl_object_name),
+ __GET_GL_OBJECT_INFO_ERR);
+ }
+};
+
+/*! \brief Class interface for GL Render Buffer Memory Objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferRenderGL : public Buffer
+{
+public:
+ /*! \brief Constructs a BufferRenderGL in a specified context, from a given
+ * GL Renderbuffer.
+ *
+ * Wraps clCreateFromGLRenderbuffer().
+ */
+ BufferRenderGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLuint bufobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLRenderbuffer(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferRenderGL() : Buffer() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+    BufferRenderGL(const BufferRenderGL& buffer) : Buffer(buffer) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferRenderGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferRenderGL& operator = (const BufferRenderGL& rhs)
+ {
+ if (this != &rhs) {
+ Buffer::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferRenderGL& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetGLObjectInfo().
+ cl_int getObjectInfo(
+ cl_gl_object_type *type,
+ GLuint * gl_object_name)
+ {
+ return detail::errHandler(
+ ::clGetGLObjectInfo(object_,type,gl_object_name),
+ __GET_GL_OBJECT_INFO_ERR);
+ }
+};
+
+/*! \brief C++ base class for Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image : public Memory
+{
+protected:
+ //! \brief Default constructor - initializes to NULL.
+ Image() : Memory() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image(const Image& image) : Memory(image) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
+
+ /*! \brief Assignment from Image - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image& operator = (const Image& rhs)
+ {
+ if (this != &rhs) {
+ Memory::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image& operator = (const cl_mem& rhs)
+ {
+ Memory::operator=(rhs);
+ return *this;
+ }
+
+public:
+ //! \brief Wrapper for clGetImageInfo().
+ template <typename T>
+ cl_int getImageInfo(cl_image_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetImageInfo, object_, name, param),
+ __GET_IMAGE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetImageInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_image_info, name>::param_type
+ getImageInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_image_info, name>::param_type param;
+ cl_int result = getImageInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+};
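+
+// Example (an illustrative sketch): querying per-image properties through
+// the typed wrapper, given any concrete image object 'image':
+//
+//     ::size_t width    = image.getImageInfo<CL_IMAGE_WIDTH>();
+//     ::size_t rowPitch = image.getImageInfo<CL_IMAGE_ROW_PITCH>();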
+
+#if defined(CL_VERSION_1_2)
+/*! \brief Class interface for 1D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image1D : public Image
+{
+public:
+ /*! \brief Constructs a 1D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image1D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D,
+ width,
+ 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image1D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image1D(const Image1D& image1D) : Image(image1D) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
+
+ /*! \brief Assignment from Image1D - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image1D& operator = (const Image1D& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image1D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+
+/*! \class Image1DBuffer
+ * \brief Image interface for 1D buffer images.
+ */
+class Image1DBuffer : public Image
+{
+public:
+ Image1DBuffer(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ const Buffer &buffer,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
+ width,
+ 0, 0, 0, 0, 0, 0, 0,
+ buffer()
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ NULL,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Image1DBuffer() { }
+
+ Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
+
+ Image1DBuffer& operator = (const Image1DBuffer& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Image1DBuffer& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+
+/*! \class Image1DArray
+ * \brief Image interface for arrays of 1D images.
+ */
+class Image1DArray : public Image
+{
+public:
+ Image1DArray(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t arraySize,
+ ::size_t width,
+ ::size_t rowPitch,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D_ARRAY,
+ width,
+ 0, 0, // height, depth (unused)
+ arraySize,
+ rowPitch,
+ 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Image1DArray() { }
+
+ Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+ Image1DArray& operator = (const Image1DArray& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Image1DArray& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+
+/*! \brief Class interface for 2D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image2D : public Image
+{
+public:
+    /*! \brief Constructs a 2D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image2D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ ::size_t height,
+ ::size_t row_pitch = 0,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+ }
+#elif defined(CL_VERSION_1_2)
+ useCreateImage = true;
+#else
+ useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+ if (useCreateImage)
+ {
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D,
+ width,
+ height,
+ 0, 0, // depth, array size (unused)
+ row_pitch,
+ 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ if (!useCreateImage)
+ {
+ object_ = ::clCreateImage2D(
+ context(), flags,&format, width, height, row_pitch, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_IMAGE2D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image2D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2D(const Image2D& image2D) : Image(image2D) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
+
+ /*! \brief Assignment from Image2D - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2D& operator = (const Image2D& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
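+
+// Example (an illustrative sketch): a 640x480 read-only RGBA image
+// allocated in 'context'; the row pitch defaults to 0 (tightly packed):
+//
+//     cl::ImageFormat format(CL_RGBA, CL_UNORM_INT8);
+//     cl::Image2D image(context, CL_MEM_READ_ONLY, format, 640, 480);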
+
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 2D Image Memory objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ * \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
+{
+public:
+ /*! \brief Constructs an Image2DGL in a specified context, from a given
+ * GL Texture.
+ *
+ * Wraps clCreateFromGLTexture2D().
+ */
+ Image2DGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLenum target,
+ GLint miplevel,
+ GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture2D(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image2DGL() : Image2D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2DGL(const Image2DGL& image) : Image2D(image) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
+
+ /*! \brief Assignment from Image2DGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2DGL& operator = (const Image2DGL& rhs)
+ {
+ if (this != &rhs) {
+ Image2D::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2DGL& operator = (const cl_mem& rhs)
+ {
+ Image2D::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class Image2DArray
+ * \brief Image interface for arrays of 2D images.
+ */
+class Image2DArray : public Image
+{
+public:
+ Image2DArray(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t arraySize,
+ ::size_t width,
+ ::size_t height,
+ ::size_t rowPitch,
+ ::size_t slicePitch,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D_ARRAY,
+ width,
+ height,
+ 0, // depth (unused)
+ arraySize,
+ rowPitch,
+ slicePitch,
+ 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Image2DArray() { }
+
+ Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+ Image2DArray& operator = (const Image2DArray& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Image2DArray& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for 3D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image3D : public Image
+{
+public:
+ /*! \brief Constructs a 3D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image3D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ ::size_t width,
+ ::size_t height,
+ ::size_t depth,
+ ::size_t row_pitch = 0,
+ ::size_t slice_pitch = 0,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+ }
+#elif defined(CL_VERSION_1_2)
+ useCreateImage = true;
+#else
+ useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+ if (useCreateImage)
+ {
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE3D,
+ width,
+ height,
+ depth,
+ 0, // array size (unused)
+ row_pitch,
+ slice_pitch,
+ 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ if (!useCreateImage)
+ {
+ object_ = ::clCreateImage3D(
+ context(), flags, &format, width, height, depth, row_pitch,
+ slice_pitch, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_IMAGE3D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image3D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3D(const Image3D& image3D) : Image(image3D) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { }
+
+ /*! \brief Assignment from Image3D - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3D& operator = (const Image3D& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 3D Image Memory objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image3DGL : public Image3D
+{
+public:
+ /*! \brief Constructs an Image3DGL in a specified context, from a given
+ * GL Texture.
+ *
+ * Wraps clCreateFromGLTexture3D().
+ */
+ Image3DGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLenum target,
+ GLint miplevel,
+ GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture3D(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image3DGL() : Image3D() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3DGL(const Image3DGL& image) : Image3D(image) { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * See Memory for further details.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
+
+ /*! \brief Assignment from Image3DGL - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3DGL& operator = (const Image3DGL& rhs)
+ {
+ if (this != &rhs) {
+ Image3D::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3DGL& operator = (const cl_mem& rhs)
+ {
+ Image3D::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class ImageGL
+ * \brief General image interface for GL interop.
+ * The 2D and 3D GL image types are abstracted into this single class, which
+ * wraps all GL-sourced images, since their dimensions and format were
+ * already specified when the texture was set up on the OpenGL side.
+ */
+class ImageGL : public Image
+{
+public:
+ ImageGL(
+ const Context& context,
+ cl_mem_flags flags,
+ GLenum target,
+ GLint miplevel,
+ GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ ImageGL() : Image() { }
+
+ ImageGL(const ImageGL& image) : Image(image) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
+
+ ImageGL& operator = (const ImageGL& rhs)
+ {
+ if (this != &rhs) {
+ Image::operator=(rhs);
+ }
+ return *this;
+ }
+
+ ImageGL& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for cl_sampler.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_sampler as the original. For details, see
+ * clRetainSampler() and clReleaseSampler().
+ *
+ * \see cl_sampler
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseSampler() on the value held by this instance.
+ */
+ ~Sampler() { }
+
+ //! \brief Default constructor - initializes to NULL.
+ Sampler() { }
+
+ /*! \brief Constructs a Sampler in a specified context.
+ *
+ * Wraps clCreateSampler().
+ */
+ Sampler(
+ const Context& context,
+ cl_bool normalized_coords,
+ cl_addressing_mode addressing_mode,
+ cl_filter_mode filter_mode,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateSampler(
+ context(),
+ normalized_coords,
+ addressing_mode,
+ filter_mode,
+ &error);
+
+ detail::errHandler(error, __CREATE_SAMPLER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * This calls clRetainSampler() on the parameter's cl_sampler.
+ */
+ Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+ /*! \brief Constructor from cl_sampler - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_sampler
+ * into the new Sampler object.
+ */
+ Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+ /*! \brief Assignment operator from Sampler.
+ *
+ * This calls clRetainSampler() on the parameter and clReleaseSampler()
+ * on the previous value held by this instance.
+ */
+ Sampler& operator = (const Sampler& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_sampler - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseSampler() on the value previously held by this instance.
+ */
+ Sampler& operator = (const cl_sampler& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetSamplerInfo().
+ template <typename T>
+ cl_int getInfo(cl_sampler_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+ __GET_SAMPLER_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetSamplerInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_sampler_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_sampler_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+};
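+
+/* Usage sketch (editor's illustration; a valid cl::Context `ctx` and a
+ * cl_int `err` are assumed to exist):
+ * \code
+ * cl::Sampler sampler(ctx,
+ *                     CL_FALSE,                  // unnormalized coordinates
+ *                     CL_ADDRESS_CLAMP_TO_EDGE,  // addressing mode
+ *                     CL_FILTER_NEAREST,         // filter mode
+ *                     &err);
+ * cl_filter_mode mode = sampler.getInfo<CL_SAMPLER_FILTER_MODE>();
+ * \endcode
+ */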
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+ size_t<3> sizes_;
+ cl_uint dimensions_;
+
+public:
+ //! \brief Default constructor - resulting range has zero dimensions.
+ NDRange()
+ : dimensions_(0)
+ { }
+
+ //! \brief Constructs one-dimensional range.
+ NDRange(::size_t size0)
+ : dimensions_(1)
+ {
+ sizes_[0] = size0;
+ }
+
+ //! \brief Constructs two-dimensional range.
+ NDRange(::size_t size0, ::size_t size1)
+ : dimensions_(2)
+ {
+ sizes_[0] = size0;
+ sizes_[1] = size1;
+ }
+
+ //! \brief Constructs three-dimensional range.
+ NDRange(::size_t size0, ::size_t size1, ::size_t size2)
+ : dimensions_(3)
+ {
+ sizes_[0] = size0;
+ sizes_[1] = size1;
+ sizes_[2] = size2;
+ }
+
+ /*! \brief Conversion operator to const ::size_t *.
+ *
+ * \returns a pointer to the size of the first dimension.
+ */
+ operator const ::size_t*() const {
+ return (const ::size_t*) sizes_;
+ }
+
+ //! \brief Queries the number of dimensions in the range.
+ ::size_t dimensions() const { return dimensions_; }
+};
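+
+/* Usage sketch (editor's illustration): NDRange values describe offsets and
+ * work sizes for CommandQueue::enqueueNDRangeKernel().
+ * \code
+ * cl::NDRange global(1024, 768);  // 2D global work size
+ * cl::NDRange local(16, 16);      // 2D work-group size
+ * // cl::NullRange (defined below) stands for "no offset" or
+ * // "let the runtime pick the work-group size".
+ * \endcode
+ */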
+
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
+
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
+{
+ ::size_t size_;
+};
+
+namespace detail {
+
+template <typename T>
+struct KernelArgumentHandler
+{
+ static ::size_t size(const T&) { return sizeof(T); }
+ static T* ptr(T& value) { return &value; }
+};
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+ static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+ static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+} // namespace detail
+//! \endcond
+
+/*! __local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ *
+ * \deprecated Use Local() instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
+__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline LocalSpaceArg
+__local(::size_t size)
+{
+ LocalSpaceArg ret = { size };
+ return ret;
+}
+
+/*! Local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ */
+inline LocalSpaceArg
+Local(::size_t size)
+{
+ LocalSpaceArg ret = { size };
+ return ret;
+}
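+
+/* Usage sketch (editor's illustration; `kernel` and the argument index are
+ * hypothetical): reserve 256 floats of __local memory for kernel argument 2.
+ * \code
+ * kernel.setArg(2, cl::Local(256 * sizeof(cl_float)));
+ * \endcode
+ */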
+
+//class KernelFunctor;
+
+/*! \brief Class interface for cl_kernel.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_kernel as the original. For details, see
+ * clRetainKernel() and clReleaseKernel().
+ *
+ * \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+ inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+ /*! \brief Destructor.
+ *
+ * This calls clReleaseKernel() on the value held by this instance.
+ */
+ ~Kernel() { }
+
+ //! \brief Default constructor - initializes to NULL.
+ Kernel() { }
+
+ /*! \brief Copy constructor - performs shallow copy.
+ *
+ * This calls clRetainKernel() on the parameter's cl_kernel.
+ */
+ Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+ /*! \brief Constructor from cl_kernel - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_kernel
+ * into the new Kernel object.
+ */
+ __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+ /*! \brief Assignment operator from Kernel.
+ *
+ * This calls clRetainKernel() on the parameter and clReleaseKernel()
+ * on the previous value held by this instance.
+ */
+ Kernel& operator = (const Kernel& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ /*! \brief Assignment operator from cl_kernel - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseKernel() on the value previously held by this instance.
+ */
+ Kernel& operator = (const cl_kernel& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_kernel_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetKernelInfo, object_, name, param),
+ __GET_KERNEL_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+#if defined(CL_VERSION_1_2)
+ template <typename T>
+ cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+ __GET_KERNEL_ARG_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+ getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_arg_info, name>::param_type param;
+ cl_int result = getArgInfo(argIndex, name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ template <typename T>
+ cl_int getWorkGroupInfo(
+ const Device& device, cl_kernel_work_group_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+ __GET_KERNEL_WORK_GROUP_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+ getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_work_group_info, name>::param_type param;
+ cl_int result = getWorkGroupInfo(device, name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ template <typename T>
+ cl_int setArg(cl_uint index, T value)
+ {
+ return detail::errHandler(
+ ::clSetKernelArg(
+ object_,
+ index,
+ detail::KernelArgumentHandler<T>::size(value),
+ detail::KernelArgumentHandler<T>::ptr(value)),
+ __SET_KERNEL_ARGS_ERR);
+ }
+
+ cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+ {
+ return detail::errHandler(
+ ::clSetKernelArg(object_, index, size, argPtr),
+ __SET_KERNEL_ARGS_ERR);
+ }
+};
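+
+/* Usage sketch (editor's illustration; `program`, `device`, `buffer` and the
+ * kernel name "my_kernel" are assumed/hypothetical):
+ * \code
+ * cl_int err;
+ * cl::Kernel kernel(program, "my_kernel", &err);
+ * kernel.setArg(0, buffer);        // cl::Buffer argument
+ * kernel.setArg(1, (cl_int)1024);  // scalar argument
+ * ::size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);
+ * \endcode
+ */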
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+ typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+ typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+ Program(
+ const STRING_CLASS& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const ::size_t length = source.size();
+
+ Context context = Context::getDefault(err);
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ "",
+ NULL,
+ NULL);
+
+ detail::errHandler(error, __BUILD_PROGRAM_ERR);
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Program(
+ const Context& context,
+ const STRING_CLASS& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const ::size_t length = source.size();
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ "",
+ NULL,
+ NULL);
+
+ detail::errHandler(error, __BUILD_PROGRAM_ERR);
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Program(
+ const Context& context,
+ const Sources& sources,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const ::size_t n = (::size_t)sources.size();
+ ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+ const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+ for (::size_t i = 0; i < n; ++i) {
+ strings[i] = sources[i].first;
+ lengths[i] = sources[i].second;
+ }
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)n, strings, lengths, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /**
+ * Construct a program object from a list of devices and a per-device list of binaries.
+ * \param context A valid OpenCL context in which to construct the program.
+ * \param devices A vector of OpenCL device objects for which the program will be created.
+ * \param binaries A vector of pairs of a pointer to a binary object and its length.
+ * \param binaryStatus An optional vector that on completion will be resized to
+ * match the size of binaries and filled with values to specify if each binary
+ * was successfully loaded.
+ * Set to CL_SUCCESS if the binary was successfully loaded.
+ * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+ * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+ * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+ * CL_INVALID_CONTEXT if context is not a valid context.
+ * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
+ * or if any entry in binaries is NULL or has length 0.
+ * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+ * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+ * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+ */
+ Program(
+ const Context& context,
+ const VECTOR_CLASS<Device>& devices,
+ const Binaries& binaries,
+ VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const ::size_t numDevices = devices.size();
+
+ // Catch size mismatch early and return
+ if(binaries.size() != numDevices) {
+ error = CL_INVALID_VALUE;
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+ const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char*));
+
+ for (::size_t i = 0; i < numDevices; ++i) {
+ images[i] = (const unsigned char*)binaries[i].first;
+ lengths[i] = binaries[i].second;
+ }
+
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ if(binaryStatus) {
+ binaryStatus->resize(numDevices);
+ }
+
+ object_ = ::clCreateProgramWithBinary(
+ context(), (cl_uint) devices.size(),
+ deviceIDs,
+ lengths, images, binaryStatus != NULL
+ ? &binaryStatus->front()
+ : NULL, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Create a program using built-in kernels.
+ * \param kernelNames Semicolon-separated list of built-in kernel names
+ */
+ Program(
+ const Context& context,
+ const VECTOR_CLASS<Device>& devices,
+ const STRING_CLASS& kernelNames,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ object_ = ::clCreateProgramWithBuiltInKernels(
+ context(),
+ (cl_uint) devices.size(),
+ deviceIDs,
+ kernelNames.c_str(),
+ &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ Program() { }
+
+ Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+ Program& operator = (const Program& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ Program& operator = (const cl_program& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ cl_int build(
+ const VECTOR_CLASS<Device>& devices,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ return detail::errHandler(
+ ::clBuildProgram(
+ object_,
+ (cl_uint)
+ devices.size(),
+ deviceIDs,
+ options,
+ notifyFptr,
+ data),
+ __BUILD_PROGRAM_ERR);
+ }
+
+ cl_int build(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ return detail::errHandler(
+ ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ notifyFptr,
+ data),
+ __BUILD_PROGRAM_ERR);
+ }
+
+#if defined(CL_VERSION_1_2)
+ cl_int compile(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ return detail::errHandler(
+ ::clCompileProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ 0,
+ NULL,
+ NULL,
+ notifyFptr,
+ data),
+ __COMPILE_PROGRAM_ERR);
+ }
+#endif
+
+ template <typename T>
+ cl_int getInfo(cl_program_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetProgramInfo, object_, name, param),
+ __GET_PROGRAM_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_program_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ template <typename T>
+ cl_int getBuildInfo(
+ const Device& device, cl_program_build_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetProgramBuildInfo, object_, device(), name, param),
+ __GET_PROGRAM_BUILD_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_program_build_info, name>::param_type
+ getBuildInfo(const Device& device, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_build_info, name>::param_type param;
+ cl_int result = getBuildInfo(device, name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+ {
+ cl_uint numKernels;
+ cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+ err = ::clCreateKernelsInProgram(
+ object_, numKernels, (cl_kernel*) value, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ kernels->assign(&value[0], &value[numKernels]);
+ return CL_SUCCESS;
+ }
+};
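+
+/* Usage sketch (editor's illustration; assumes STRING_CLASS is std::string,
+ * the library default, and that `ctx`, `device` and `src` already exist):
+ * \code
+ * cl::Program program(ctx, src);          // create without building
+ * if (program.build("-cl-std=CL1.1") != CL_SUCCESS) {
+ *     std::string log =
+ *         program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+ *     // inspect `log` for compiler diagnostics
+ * }
+ * \endcode
+ */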
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+ Program input1,
+ Program input2,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int err_local = CL_SUCCESS;
+
+ cl_program programs[2] = { input1(), input2() };
+
+ Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
+
+ cl_program prog = ::clLinkProgram(
+ ctx(),
+ 0,
+ NULL,
+ options,
+ 2,
+ programs,
+ notifyFptr,
+ data,
+ &err_local);
+
+ detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = err_local;
+ }
+
+ return Program(prog);
+}
+
+inline Program linkProgram(
+ VECTOR_CLASS<Program> inputPrograms,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int err_local = CL_SUCCESS;
+
+ cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+ if (programs != NULL) {
+ for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+ programs[i] = inputPrograms[i]();
+ }
+ }
+
+ cl_program prog = ::clLinkProgram(
+ Context::getDefault()(),
+ 0,
+ NULL,
+ options,
+ (cl_uint)inputPrograms.size(),
+ programs,
+ notifyFptr,
+ data,
+ &err_local);
+
+ detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = err_local;
+ }
+
+ return Program(prog);
+}
+#endif
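+
+/* Usage sketch (editor's illustration, OpenCL 1.2 only; `ctx`, `srcA` and
+ * `srcB` are assumed): compile two programs separately, then link them.
+ * \code
+ * cl::Program a(ctx, srcA);  a.compile();
+ * cl::Program b(ctx, srcB);  b.compile();
+ * cl_int err;
+ * cl::Program linked = cl::linkProgram(a, b, NULL, NULL, NULL, &err);
+ * \endcode
+ */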
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+ VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+ VECTOR_CLASS<char *> binaries;
+ for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s)
+ {
+ char *ptr = NULL;
+ if (*s != 0)
+ ptr = new char[*s];
+ binaries.push_back(ptr);
+ }
+
+ cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+ if (err != NULL) {
+ *err = result;
+ }
+ return binaries;
+}
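+
+/* Note (editor's): the char* buffers returned above are allocated with new[]
+ * and ownership passes to the caller, who must delete[] each entry. Sketch,
+ * assuming a built cl::Program `program`:
+ * \code
+ * VECTOR_CLASS<char *> bins = program.getInfo<CL_PROGRAM_BINARIES>();
+ * // ... persist each binary, using CL_PROGRAM_BINARY_SIZES for lengths ...
+ * for (::size_t i = 0; i < bins.size(); ++i)
+ *     delete[] bins[i];
+ * \endcode
+ */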
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+ cl_int error;
+
+ object_ = ::clCreateKernel(program(), name, &error);
+ detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+ if (err != NULL) {
+ *err = error;
+ }
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+ static volatile int default_initialized_;
+ static CommandQueue default_;
+ static volatile cl_int default_error_;
+public:
+ CommandQueue(
+ cl_command_queue_properties properties,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ }
+ /*!
+ * \brief Constructs a CommandQueue for an implementation-defined device in the given context.
+ */
+ explicit CommandQueue(
+ const Context& context,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ VECTOR_CLASS<cl::Device> devices;
+ error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS)
+ {
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ CommandQueue(
+ const Context& context,
+ const Device& device,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ static CommandQueue getDefault(cl_int * err = NULL)
+ {
+ int state = detail::compare_exchange(
+ &default_initialized_,
+ __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+
+ if (state & __DEFAULT_INITIALIZED) {
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ if (state & __DEFAULT_BEING_INITIALIZED) {
+ // Assume writes will propagate eventually...
+ while(default_initialized_ != __DEFAULT_INITIALIZED) {
+ detail::fence();
+ }
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+ default_ = CommandQueue(context, device, 0, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ detail::fence();
+
+ default_error_ = error;
+ // Assume writes will propagate eventually...
+ default_initialized_ = __DEFAULT_INITIALIZED;
+
+ detail::fence();
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+
+ }
+
+ CommandQueue() { }
+
+ CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+ CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+ CommandQueue& operator = (const CommandQueue& rhs)
+ {
+ if (this != &rhs) {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ }
+ return *this;
+ }
+
+ CommandQueue& operator = (const cl_command_queue& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_command_queue_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetCommandQueueInfo, object_, name, param),
+ __GET_COMMAND_QUEUE_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_command_queue_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_command_queue_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ const void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ ::size_t src_offset,
+ ::size_t dst_offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBuffer(
+ object_, src(), dst(), src_offset, dst_offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQEUE_COPY_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBufferRect(
+ object_,
+ buffer(),
+ blocking,
+ (const ::size_t *)buffer_offset,
+ (const ::size_t *)host_offset,
+ (const ::size_t *)region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteBufferRect(
+ object_,
+ buffer(),
+ blocking,
+ (const ::size_t *)buffer_offset,
+ (const ::size_t *)host_offset,
+ (const ::size_t *)region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBufferRect(
+ const Buffer& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ ::size_t src_row_pitch,
+ ::size_t src_slice_pitch,
+ ::size_t dst_row_pitch,
+ ::size_t dst_slice_pitch,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBufferRect(
+ object_,
+ src(),
+ dst(),
+ (const ::size_t *)src_origin,
+ (const ::size_t *)dst_origin,
+ (const ::size_t *)region,
+ src_row_pitch,
+ src_slice_pitch,
+ dst_row_pitch,
+ dst_slice_pitch,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQEUE_COPY_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Enqueue a command to fill a buffer object with a pattern
+ * of a given size. The pattern is specified as a vector.
+ * \tparam PatternType The data type of the pattern field.
+ * The pattern type must be an accepted OpenCL data type.
+ */
+ template<typename PatternType>
+ cl_int enqueueFillBuffer(
+ const Buffer& buffer,
+ PatternType pattern,
+ ::size_t offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillBuffer(
+ object_,
+ buffer(),
+ static_cast<void*>(&pattern),
+ sizeof(PatternType),
+ offset,
+ size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if defined(CL_VERSION_1_2)
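+
+ /* Usage sketch (editor's illustration, OpenCL 1.2 only; `queue`, `buf` and
+ * `n` are hypothetical): zero-fill a buffer of n floats.
+ * \code
+ * cl_float zero = 0.0f;
+ * queue.enqueueFillBuffer(buf, zero, 0, n * sizeof(cl_float));
+ * \endcode
+ */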
+
+ cl_int enqueueReadImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadImage(
+ object_, image(), blocking, (const ::size_t *) origin,
+ (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteImage(
+ object_, image(), blocking, (const ::size_t *) origin,
+ (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyImage(
+ const Image& src,
+ const Image& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyImage(
+ object_, src(), dst(), (const ::size_t *) src_origin,
+ (const ::size_t *)dst_origin, (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four-component RGBA floating-point color value if
+ * the image channel data type is not an unnormalized signed or
+ * unsigned data type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_float4 fillColor,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ (const ::size_t *) origin,
+ (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four-component RGBA signed integer color value if
+ * the image channel data type is an unnormalized signed integer
+ * type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_int4 fillColor,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ (const ::size_t *) origin,
+ (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four-component RGBA unsigned integer color value if
+ * the image channel data type is an unnormalized unsigned integer
+ * type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_uint4 fillColor,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ (const ::size_t *) origin,
+ (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ cl_int enqueueCopyImageToBuffer(
+ const Image& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& region,
+ ::size_t dst_offset,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyImageToBuffer(
+ object_, src(), dst(), (const ::size_t *) src_origin,
+ (const ::size_t *) region, dst_offset,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBufferToImage(
+ const Buffer& src,
+ const Image& dst,
+ ::size_t src_offset,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBufferToImage(
+ object_, src(), dst(), src_offset,
+ (const ::size_t *) dst_origin, (const ::size_t *) region,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ void* enqueueMapBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ ::size_t offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL) const
+ {
+ cl_int error;
+ void * result = ::clEnqueueMapBuffer(
+ object_, buffer(), blocking, flags, offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+ }
+
+ void* enqueueMapImage(
+ const Image& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t * row_pitch,
+ ::size_t * slice_pitch,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL) const
+ {
+ cl_int error;
+ void * result = ::clEnqueueMapImage(
+ object_, buffer(), blocking, flags,
+ (const ::size_t *) origin, (const ::size_t *) region,
+ row_pitch, slice_pitch,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+ }
+
+ cl_int enqueueUnmapMemObject(
+ const Memory& memory,
+ void* mapped_ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueUnmapMemObject(
+ object_, memory(), mapped_ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Enqueues a marker command which waits for either a list of events to complete,
+ * or all previously enqueued commands to complete.
+ *
+ * Enqueues a marker command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+ * to complete before it completes. This command returns an event which can be waited on,
+ * i.e. this event can be waited on to ensure that all events either in the event_wait_list
+ * or all previously enqueued commands, queued before this command to command_queue,
+ * have completed.
+ */
+ cl_int enqueueMarkerWithWaitList(
+ const VECTOR_CLASS<Event> *events = NULL,
+ Event *event = NULL)
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueMarkerWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MARKER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * A synchronization point that enqueues a barrier operation.
+ *
+ * Enqueues a barrier command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+ * to complete before it completes. This command blocks command execution, that is, any
+ * following commands enqueued after it do not execute until it completes. This command
+ * returns an event which can be waited on, i.e. this event can be waited on to ensure that
+ * all events either in the event_wait_list or all previously enqueued commands, queued
+ * before this command to command_queue, have completed.
+ */
+ cl_int enqueueBarrierWithWaitList(
+ const VECTOR_CLASS<Event> *events = NULL,
+ Event *event = NULL)
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueBarrierWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_BARRIER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueues a command to indicate with which device a set of memory objects
+ * should be associated.
+ */
+ cl_int enqueueMigrateMemObjects(
+ const VECTOR_CLASS<Memory> &memObjects,
+ cl_mem_migration_flags flags,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL
+ )
+ {
+ cl_event tmp;
+
+ cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
+ for( int i = 0; i < (int)memObjects.size(); ++i ) {
+ localMemObjects[i] = memObjects[i]();
+ }
+
+ cl_int err = detail::errHandler(
+ ::clEnqueueMigrateMemObjects(
+ object_,
+ (cl_uint)memObjects.size(),
+ static_cast<const cl_mem*>(localMemObjects),
+ flags,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ cl_int enqueueNDRangeKernel(
+ const Kernel& kernel,
+ const NDRange& offset,
+ const NDRange& global,
+ const NDRange& local = NullRange,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueNDRangeKernel(
+ object_, kernel(), (cl_uint) global.dimensions(),
+ offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+ (const ::size_t*) global,
+ local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueTask(
+ const Kernel& kernel,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueTask(
+ object_, kernel(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_TASK_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueNativeKernel(
+ void (CL_CALLBACK *userFptr)(void *),
+ std::pair<void*, ::size_t> args,
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<const void*>* mem_locs = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0)
+ ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
+ : NULL;
+
+ if (mems != NULL) {
+ for (unsigned int i = 0; i < mem_objects->size(); i++) {
+ mems[i] = ((*mem_objects)[i])();
+ }
+ }
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueNativeKernel(
+ object_, userFptr, args.first, args.second,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ mems,
+ (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_NATIVE_KERNEL);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+/**
+ * APIs deprecated in OpenCL 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueMarker(object_, (cl_event*) event),
+ __ENQUEUE_MARKER_ERR);
+ }
+
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueWaitForEvents(
+ object_,
+ (cl_uint) events.size(),
+ (const cl_event*) &events.front()),
+ __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+ }
+#endif // deprecated OpenCL 1.1 APIs
+
+ cl_int enqueueAcquireGLObjects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueAcquireGLObjects(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_ACQUIRE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReleaseGLObjects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReleaseGLObjects(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_RELEASE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined (USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+ cl_command_queue command_queue, cl_uint num_objects,
+ const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+ cl_command_queue command_queue, cl_uint num_objects,
+ const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list, cl_event* event);
+
+ cl_int enqueueAcquireD3D10Objects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+ cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+ cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+ cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+ __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#elif defined(CL_VERSION_1_1)
+ __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ pfn_clEnqueueAcquireD3D10ObjectsKHR(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_ACQUIRE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReleaseD3D10Objects(
+ const VECTOR_CLASS<Memory>* mem_objects = NULL,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+ cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+ cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+ cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+ __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#elif defined(CL_VERSION_1_1)
+ __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_2) / #elif defined(CL_VERSION_1_1)
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ pfn_clEnqueueReleaseD3D10ObjectsKHR(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_RELEASE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif
+
+/**
+ * APIs deprecated in OpenCL 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueBarrier(object_),
+ __ENQUEUE_BARRIER_ERR);
+ }
+#endif // deprecated OpenCL 1.1 APIs
+
+ cl_int flush() const
+ {
+ return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+ }
+
+ cl_int finish() const
+ {
+ return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+ }
+};
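+
+/* End-to-end usage sketch (editor's illustration; `ctx`, `device`, `kernel`,
+ * `buf`, `hostData` and `bytes` are all assumed to exist):
+ * \code
+ * cl_int err;
+ * cl::CommandQueue queue(ctx, device, 0, &err);
+ * queue.enqueueWriteBuffer(buf, CL_TRUE, 0, bytes, hostData);
+ * kernel.setArg(0, buf);
+ * queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+ *                            cl::NDRange(1024), cl::NullRange);
+ * queue.enqueueReadBuffer(buf, CL_TRUE, 0, bytes, hostData);
+ * queue.finish();
+ * \endcode
+ */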
+
+#ifdef _WIN32
+__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) CommandQueue CommandQueue::default_;
+__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) CommandQueue CommandQueue::default_;
+__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#endif
+
+template< typename IteratorType >
+Buffer::Buffer(
+ const Context &context,
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr,
+ cl_int* err)
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if( readOnly ) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if( useHostPtr ) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+ if( useHostPtr ) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ } else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if( !useHostPtr ) {
+ CommandQueue queue(context, 0, &error);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ error = cl::copy(queue, startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+}
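+
+/* Usage sketch (editor's illustration; assumes VECTOR_CLASS is std::vector,
+ * the default, and a valid cl::Context `ctx`): build a read-write device
+ * buffer from host data. With useHostPtr == false the data is copied via a
+ * temporary queue, as implemented above.
+ * \code
+ * std::vector<float> host(1024, 1.0f);
+ * cl_int err;
+ * cl::Buffer buf(ctx, host.begin(), host.end(),
+ *                false,   // readOnly
+ *                false,   // useHostPtr
+ *                &err);
+ * \endcode
+ */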
+
+inline cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ const void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ ::size_t offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ void * result = ::clEnqueueMapBuffer(
+ queue(), buffer(), blocking, flags, offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+ const Memory& memory,
+ void* mapped_ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueUnmapMemObject(
+ queue(), memory(), mapped_ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+}
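+
+/* Usage sketch (editor's illustration; `buf` and `bytes` are hypothetical):
+ * map a buffer on the default queue, write through the pointer, then unmap.
+ * \code
+ * cl_int err;
+ * float* p = static_cast<float*>(cl::enqueueMapBuffer(
+ *     buf, CL_TRUE, CL_MAP_WRITE, 0, bytes, NULL, NULL, &err));
+ * if (err == CL_SUCCESS) {
+ *     p[0] = 42.0f;
+ *     cl::enqueueUnmapMemObject(buf, p);
+ * }
+ * \endcode
+ */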
+
+inline cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ ::size_t src_offset,
+ ::size_t dst_offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, startIterator, endIterator, buffer);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, buffer, startIterator, endIterator);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ ::size_t length = endIterator-startIterator;
+ ::size_t byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+#if defined(_MSC_VER)
+ std::copy(
+ startIterator,
+ endIterator,
+ stdext::checked_array_iterator<DataType*>(
+ pointer, length));
+#else
+ std::copy(startIterator, endIterator, pointer);
+#endif
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ ::size_t length = endIterator-startIterator;
+ ::size_t byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ std::copy(pointer, pointer + length, startIterator);
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
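+
+/**
+ * A minimal sketch of the copy() helpers above, assuming a context and
+ * default queue are already set up; `context`, `host` and `dev` are
+ * illustrative names.
+ * \code
+ * std::vector<float> host(1024, 1.0f);
+ * cl::Buffer dev(context, CL_MEM_READ_WRITE, host.size() * sizeof(float));
+ * cl::copy(host.begin(), host.end(), dev); // Host to Device (blocking)
+ * cl::copy(dev, host.begin(), host.end()); // Device to Host (blocking)
+ * \endcode
+ */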
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+ const Buffer& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ ::size_t src_row_pitch,
+ ::size_t src_slice_pitch,
+ ::size_t dst_row_pitch,
+ ::size_t dst_slice_pitch,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBufferRect(
+ src,
+ dst,
+ src_origin,
+ dst_origin,
+ region,
+ src_row_pitch,
+ src_slice_pitch,
+ dst_row_pitch,
+ dst_slice_pitch,
+ events,
+ event);
+}
+#endif
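+
+/**
+ * Sketch of the rectangular helpers above: read a w x h sub-rectangle of a
+ * row-major float buffer into tightly packed host memory. Widths and pitches
+ * are in bytes, heights in rows; `src`, `out`, `x`, `y`, `w`, `h` and
+ * `src_row_pitch` are illustrative names.
+ * \code
+ * cl::size_t<3> buf_origin;  // zero-initialised by size_t<3>
+ * buf_origin[0] = x * sizeof(float); buf_origin[1] = y;
+ * cl::size_t<3> host_origin; // (0, 0, 0)
+ * cl::size_t<3> region;
+ * region[0] = w * sizeof(float); region[1] = h; region[2] = 1;
+ * cl::enqueueReadBufferRect(src, CL_TRUE, buf_origin, host_origin, region,
+ *                           src_row_pitch, 0, w * sizeof(float), 0, out);
+ * \endcode
+ */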
+
+inline cl_int enqueueReadImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadImage(
+ image,
+ blocking,
+ origin,
+ region,
+ row_pitch,
+ slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteImage(
+ image,
+ blocking,
+ origin,
+ region,
+ row_pitch,
+ slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyImage(
+ const Image& src,
+ const Image& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyImage(
+ src,
+ dst,
+ src_origin,
+ dst_origin,
+ region,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+ const Image& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& region,
+ ::size_t dst_offset,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyImageToBuffer(
+ src,
+ dst,
+ src_origin,
+ region,
+ dst_offset,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+ const Buffer& src,
+ const Image& dst,
+ ::size_t src_offset,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBufferToImage(
+ src,
+ dst,
+ src_offset,
+ dst_origin,
+ region,
+ events,
+ event);
+}
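+
+/**
+ * Sketch of the image helpers above: blocking read of a width x height 2D
+ * image into host memory on the default queue. Pitches of 0 let the runtime
+ * assume tightly packed rows; `img`, `width`, `height` and `pixels` are
+ * illustrative names.
+ * \code
+ * cl::size_t<3> origin;  // (0, 0, 0)
+ * cl::size_t<3> region;
+ * region[0] = width; region[1] = height; region[2] = 1;
+ * cl::enqueueReadImage(img, CL_TRUE, origin, region, 0, 0, pixels);
+ * \endcode
+ */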
+
+
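+/**
+ * Issues all previously queued commands on the default command queue to the
+ * device. Does not wait for their completion.
+ */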
+inline cl_int flush(void)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.flush();
+}
+
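+/**
+ * Blocks until all previously queued commands on the default command queue
+ * have been issued and have completed.
+ */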
+inline cl_int finish(void)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.finish();
+}
+
+// Kernel Functor support
+// New interface as of September 2011
+// Requires C++11 std::function (TR1's std::tr1::function is not supported).
+// Supported from Visual Studio 2010 and GCC 4.2 onwards.
+// A usage sketch follows the EnqueueArgs definition below.
+
+struct EnqueueArgs
+{
+ CommandQueue queue_;
+ const NDRange offset_;
+ const NDRange global_;
+ const NDRange local_;
+ VECTOR_CLASS<Event> events_;
+
+ EnqueueArgs(NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+
+ }
+
+ EnqueueArgs(NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(Event e, NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(Event e, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+};
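+
+/**
+ * A few illustrative EnqueueArgs forms (sketch; `queue` and `previous` are
+ * assumed to exist in the caller's scope):
+ * \code
+ * cl::EnqueueArgs a(cl::NDRange(1024));                  // default queue, 1-D global size
+ * cl::EnqueueArgs b(cl::NDRange(1024), cl::NDRange(64)); // explicit local size
+ * cl::EnqueueArgs c(queue, cl::NDRange(16), cl::NDRange(1024), cl::NDRange(64)); // offset form
+ * cl::EnqueueArgs d(previous, cl::NDRange(1024));        // wait on an Event first
+ * \endcode
+ */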
+
+namespace detail {
+
+class NullType {};
+
+template<int index, typename T0>
+struct SetArg
+{
+ static void set (Kernel kernel, T0 arg)
+ {
+ kernel.setArg(index, arg);
+ }
+};
+
+template<int index>
+struct SetArg<index, NullType>
+{
+ static void set (Kernel, NullType)
+ {
+ }
+};
+
+template <
+ typename T0, typename T1, typename T2, typename T3,
+ typename T4, typename T5, typename T6, typename T7,
+ typename T8, typename T9, typename T10, typename T11,
+ typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19,
+ typename T20, typename T21, typename T22, typename T23,
+ typename T24, typename T25, typename T26, typename T27,
+ typename T28, typename T29, typename T30, typename T31
+>
+class KernelFunctorGlobal
+{
+private:
+ Kernel kernel_;
+
+public:
+ KernelFunctorGlobal(
+ Kernel kernel) :
+ kernel_(kernel)
+ {}
+
+ KernelFunctorGlobal(
+ const Program& program,
+ const STRING_CLASS name,
+ cl_int * err = NULL) :
+ kernel_(program, name.c_str(), err)
+ {}
+
+ Event operator() (
+ const EnqueueArgs& args,
+ T0 t0,
+ T1 t1 = NullType(),
+ T2 t2 = NullType(),
+ T3 t3 = NullType(),
+ T4 t4 = NullType(),
+ T5 t5 = NullType(),
+ T6 t6 = NullType(),
+ T7 t7 = NullType(),
+ T8 t8 = NullType(),
+ T9 t9 = NullType(),
+ T10 t10 = NullType(),
+ T11 t11 = NullType(),
+ T12 t12 = NullType(),
+ T13 t13 = NullType(),
+ T14 t14 = NullType(),
+ T15 t15 = NullType(),
+ T16 t16 = NullType(),
+ T17 t17 = NullType(),
+ T18 t18 = NullType(),
+ T19 t19 = NullType(),
+ T20 t20 = NullType(),
+ T21 t21 = NullType(),
+ T22 t22 = NullType(),
+ T23 t23 = NullType(),
+ T24 t24 = NullType(),
+ T25 t25 = NullType(),
+ T26 t26 = NullType(),
+ T27 t27 = NullType(),
+ T28 t28 = NullType(),
+ T29 t29 = NullType(),
+ T30 t30 = NullType(),
+ T31 t31 = NullType()
+ )
+ {
+ Event event;
+ SetArg<0, T0>::set(kernel_, t0);
+ SetArg<1, T1>::set(kernel_, t1);
+ SetArg<2, T2>::set(kernel_, t2);
+ SetArg<3, T3>::set(kernel_, t3);
+ SetArg<4, T4>::set(kernel_, t4);
+ SetArg<5, T5>::set(kernel_, t5);
+ SetArg<6, T6>::set(kernel_, t6);
+ SetArg<7, T7>::set(kernel_, t7);
+ SetArg<8, T8>::set(kernel_, t8);
+ SetArg<9, T9>::set(kernel_, t9);
+ SetArg<10, T10>::set(kernel_, t10);
+ SetArg<11, T11>::set(kernel_, t11);
+ SetArg<12, T12>::set(kernel_, t12);
+ SetArg<13, T13>::set(kernel_, t13);
+ SetArg<14, T14>::set(kernel_, t14);
+ SetArg<15, T15>::set(kernel_, t15);
+ SetArg<16, T16>::set(kernel_, t16);
+ SetArg<17, T17>::set(kernel_, t17);
+ SetArg<18, T18>::set(kernel_, t18);
+ SetArg<19, T19>::set(kernel_, t19);
+ SetArg<20, T20>::set(kernel_, t20);
+ SetArg<21, T21>::set(kernel_, t21);
+ SetArg<22, T22>::set(kernel_, t22);
+ SetArg<23, T23>::set(kernel_, t23);
+ SetArg<24, T24>::set(kernel_, t24);
+ SetArg<25, T25>::set(kernel_, t25);
+ SetArg<26, T26>::set(kernel_, t26);
+ SetArg<27, T27>::set(kernel_, t27);
+ SetArg<28, T28>::set(kernel_, t28);
+ SetArg<29, T29>::set(kernel_, t29);
+ SetArg<30, T30>::set(kernel_, t30);
+ SetArg<31, T31>::set(kernel_, t31);
+
+ args.queue_.enqueueNDRangeKernel(
+ kernel_,
+ args.offset_,
+ args.global_,
+ args.local_,
+ &args.events_,
+ &event);
+
+ return event;
+ }
+
+};
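+
+/**
+ * KernelFunctorGlobal binds a cl::Kernel, sets each non-NullType argument,
+ * and enqueues over the EnqueueArgs ranges. A sketch of the intended use via
+ * the public wrapper built on top of it (cl::make_kernel, defined later in
+ * this header); `program` and `devBuf` are illustrative names.
+ * \code
+ * cl::make_kernel<cl::Buffer, int> vecScale(program, "vec_scale");
+ * cl::Event done = vecScale(cl::EnqueueArgs(cl::NDRange(1024)), devBuf, 2);
+ * done.wait();
+ * \endcode
+ */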
+
+//------------------------------------------------------------------------------------------------------
+
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28,
+ typename T29,
+ typename T30,
+ typename T31>
+struct functionImplementation_
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ T31> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ T31);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28,
+ T29 arg29,
+ T30 arg30,
+ T31 arg31)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28,
+ arg29,
+ arg30,
+ arg31);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28,
+ typename T29,
+ typename T30>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ T30);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28,
+ T29 arg29,
+ T30 arg30)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28,
+ arg29,
+ arg30);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28,
+ typename T29>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ T29);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28,
+ T29 arg29)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28,
+ arg29);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27,
+ typename T28>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ T28);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27,
+ T28 arg28)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27,
+ arg28);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26,
+ typename T27>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ T27);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26,
+ T27 arg27)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26,
+ arg27);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25,
+ typename T26>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ T26);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25,
+ T26 arg26)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25,
+ arg26);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24,
+ typename T25>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ T25);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24,
+ T25 arg25)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24,
+ arg25);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23,
+ typename T24>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ T24);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23,
+ T24 arg24)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23,
+ arg24);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22,
+ typename T23>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ T23);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22,
+ T23 arg23)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22,
+ arg23);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21,
+ typename T22>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ T22);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21,
+ T22 arg22)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21,
+ arg22);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20,
+ typename T21>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ T21);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20,
+ T21 arg21)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20,
+ arg21);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19,
+ typename T20>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ T20);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19,
+ T20 arg20)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19,
+ arg20);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18,
+ typename T19>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ T19);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18,
+ T19 arg19)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18,
+ arg19);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17,
+ typename T18>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ T18);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17,
+ T18 arg18)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17,
+ arg18);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16,
+ typename T17>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ T17);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16,
+ T17 arg17)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16,
+ arg17);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15,
+ typename T16>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ T16);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15,
+ T16 arg16)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15,
+ arg16);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14,
+ typename T15>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ T15);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14,
+ T15 arg15)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14,
+ arg15);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13,
+ typename T14>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ T14);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13,
+ T14 arg14)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13,
+ arg14);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12,
+ typename T13>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ T13);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12,
+ T13 arg13)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12,
+ arg13);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11,
+ typename T12>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ T12);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11,
+ T12 arg12)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11,
+ arg12);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10,
+ typename T11>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ T11);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10,
+ T11 arg11)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10,
+ arg11);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9,
+ typename T10>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ T10);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9,
+ T10 arg10)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9,
+ arg10);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8,
+ typename T9>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ T9);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8,
+ T9 arg9)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8,
+ arg9);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7,
+ typename T8>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ T8);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7,
+ T8 arg8)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7,
+ arg8);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6,
+ typename T7>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ T7);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6,
+ T7 arg7)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6,
+ arg7);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5,
+ typename T6>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ T6);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5,
+ T6 arg6)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5,
+ arg6);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ T5);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4,
+ T5 arg5)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4,
+ arg5);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ T4,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3,
+ T4);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3,
+ T4 arg4)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3,
+ arg4);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2,
+ typename T3>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ T3,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ T3,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2,
+ T3);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2,
+ T3 arg3)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2,
+ arg3);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1,
+ typename T2>
+struct functionImplementation_
+< T0,
+ T1,
+ T2,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ T2,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1,
+ T2);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1,
+ T2 arg2)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1,
+ arg2);
+ }
+
+
+};
+
+template<
+ typename T0,
+ typename T1>
+struct functionImplementation_
+< T0,
+ T1,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ T1,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0,
+ T1);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0,
+ T1 arg1)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0,
+ arg1);
+ }
+
+
+};
+
+template<
+ typename T0>
+struct functionImplementation_
+< T0,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType>
+{
+ typedef detail::KernelFunctorGlobal<
+ T0,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType,
+ NullType> FunctorType;
+
+ FunctorType functor_;
+
+ functionImplementation_(const FunctorType &functor) :
+ functor_(functor)
+ {
+
+ #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
+ // Fail variadic expansion for dev11
+ static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+ #endif
+
+ }
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ T0);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ T0 arg0)
+ {
+ return functor_(
+ enqueueArgs,
+ arg0);
+ }
+
+
+};
+
+
+
+
+
+} // namespace detail
+
+//----------------------------------------------------------------------------------------------
+
+template <
+ typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType,
+ typename T3 = detail::NullType, typename T4 = detail::NullType,
+ typename T5 = detail::NullType, typename T6 = detail::NullType,
+ typename T7 = detail::NullType, typename T8 = detail::NullType,
+ typename T9 = detail::NullType, typename T10 = detail::NullType,
+ typename T11 = detail::NullType, typename T12 = detail::NullType,
+ typename T13 = detail::NullType, typename T14 = detail::NullType,
+ typename T15 = detail::NullType, typename T16 = detail::NullType,
+ typename T17 = detail::NullType, typename T18 = detail::NullType,
+ typename T19 = detail::NullType, typename T20 = detail::NullType,
+ typename T21 = detail::NullType, typename T22 = detail::NullType,
+ typename T23 = detail::NullType, typename T24 = detail::NullType,
+ typename T25 = detail::NullType, typename T26 = detail::NullType,
+ typename T27 = detail::NullType, typename T28 = detail::NullType,
+ typename T29 = detail::NullType, typename T30 = detail::NullType,
+ typename T31 = detail::NullType
+>
+struct make_kernel :
+ public detail::functionImplementation_<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ >
+{
+public:
+ typedef detail::KernelFunctorGlobal<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ > FunctorType;
+
+ make_kernel(
+ const Program& program,
+ const STRING_CLASS name,
+ cl_int * err = NULL) :
+ detail::functionImplementation_<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ >(
+ FunctorType(program, name, err))
+ {}
+
+ make_kernel(
+ const Kernel kernel) :
+ detail::functionImplementation_<
+ T0, T1, T2, T3,
+ T4, T5, T6, T7,
+ T8, T9, T10, T11,
+ T12, T13, T14, T15,
+ T16, T17, T18, T19,
+ T20, T21, T22, T23,
+ T24, T25, T26, T27,
+ T28, T29, T30, T31
+ >(
+ FunctorType(kernel))
+ {}
+};
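+
+// Illustrative usage sketch: make_kernel binds a named kernel from a built
+// program to a typed functor whose operator() enqueues the kernel and returns
+// an Event (see functionImplementation_ above). The program, queue and buffer
+// names below are hypothetical.
+//
+//   cl::make_kernel<cl::Buffer, cl::Buffer, int> vadd(program, "vadd");
+//   cl::Event e = vadd(cl::EnqueueArgs(queue, cl::NDRange(1024)), a, b, 42);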
+
+
+//----------------------------------------------------------------------------------------------------------------------
+
+#undef __ERR_STR
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_ARG_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+
+#undef __CREATE_CONTEXT_ERR
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+
+#undef __CREATE_BUFFER_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+#undef __SET_PRINTF_CALLBACK_ERR
+
+#undef __WAIT_FOR_EVENTS_ERR
+
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+
+#undef __CL_EXPLICIT_CONSTRUCTORS
+
+#undef __UNLOAD_COMPILER_ERR
+#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
+
+#undef __CL_FUNCTION_TYPE
+
+// Extensions
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_VERSION_1_1)
+#undef __INIT_CL_EXT_FCN_PTR
+#endif // #if defined(CL_VERSION_1_1)
+#undef __CREATE_SUB_DEVICES
+
+#if defined(USE_CL_DEVICE_FISSION)
+#undef __PARAM_NAME_DEVICE_FISSION
+#endif // USE_CL_DEVICE_FISSION
+
+#undef __DEFAULT_NOT_INITIALIZED
+#undef __DEFAULT_BEING_INITIALIZED
+#undef __DEFAULT_INITIALIZED
+
+} // namespace cl
+
+#ifdef _WIN32
+#pragma pop_macro("max")
+#endif // _WIN32
+
+#endif // CL_HPP_
diff --git a/include/CL/cl2.hpp b/include/CL/cl2.hpp
new file mode 100644
index 0000000000..c6cd8a716c
--- /dev/null
+++ b/include/CL/cl2.hpp
@@ -0,0 +1,9526 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33),
+ * OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29)
+ * \author Lee Howes and Bruce Merry
+ *
+ * Derived from the OpenCL 1.x C++ bindings written by
+ * Benedict R. Gaster, Laurent Morichetti and Lee Howes
+ * With additions and fixes from:
+ * Brian Cole, March 3rd 2010 and April 2012
+ * Matt Gruenke, April 2012.
+ * Bruce Merry, February 2013.
+ * Tom Deakin and Simon McIntosh-Smith, July 2013
+ * James Price, June-November 2015
+ *
+ * \version 2.0.8
+ * \date 2015-11-03
+ *
+ * Optional extension support
+ *
+ * cl_ext_device_fission
+ * #define CL_HPP_USE_CL_DEVICE_FISSION
+ * cl_khr_d3d10_sharing
+ * #define CL_HPP_USE_DX_INTEROP
+ * cl_khr_sub_groups
+ * #define CL_HPP_USE_CL_SUB_GROUPS_KHR
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ * The interface is contained within a single C++ header file \em cl2.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h; to use either the C++ or the original C
+ * bindings, it is enough to simply include \em cl2.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * There are numerous compatibility, portability and memory management
+ * fixes in the new header as well as additional OpenCL 2.0 features.
+ * As a result the header is not directly backward compatible and for this
+ * reason we release it as cl2.hpp rather than a new version of cl.hpp.
+ *
+ *
+ * \section compatibility Compatibility
+ * Due to the evolution of the underlying OpenCL API the 2.0 C++ bindings
+ * include an updated approach to defining supported feature versions
+ * and the range of valid underlying OpenCL runtime versions supported.
+ *
+ * The combination of preprocessor macros CL_HPP_TARGET_OPENCL_VERSION and
+ * CL_HPP_MINIMUM_OPENCL_VERSION control this range. These are three-digit
+ * decimal values representing OpenCL runtime versions. The default for
+ * the target is 200, representing OpenCL 2.0, and the minimum is also
+ * defined as 200. These settings would use 2.0 API calls only.
+ * If backward compatibility with a 1.2 runtime is required, the minimum
+ * version may be set to 120.
+ *
+ * Note that this is a compile-time setting, and so affects linking against
+ * a particular SDK version rather than the versioning of the loaded runtime.
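+ *
+ * For example, a build that wants 2.0 features but must also run against
+ * a 1.2 runtime could define (an illustrative sketch):
+ * \code
+ * #define CL_HPP_MINIMUM_OPENCL_VERSION 120
+ * #define CL_HPP_TARGET_OPENCL_VERSION 200
+ * #include <CL/cl2.hpp>
+ * \endcode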
+ *
+ * The earlier versions of the header included basic vector and string
+ * classes based loosely on STL versions. These were difficult to
+ * maintain and very rarely used. For the 2.0 header we now assume
+ * the presence of the standard library unless requested otherwise.
+ * We use std::array, std::vector, std::shared_ptr and std::string
+ * throughout to safely manage memory and reduce the chance of a
+ * recurrence of earlier memory management bugs.
+ *
+ * These classes are used through typedefs in the cl namespace:
+ * cl::array, cl::vector, cl::pointer and cl::string.
+ * In addition cl::allocate_pointer forwards to std::allocate_shared
+ * by default.
+ * In all cases these standard library classes can be replaced with
+ * custom interface-compatible versions using the CL_HPP_NO_STD_ARRAY,
+ * CL_HPP_NO_STD_VECTOR, CL_HPP_NO_STD_UNIQUE_PTR and
+ * CL_HPP_NO_STD_STRING macros.
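+ *
+ * As a sketch, a replacement string type (here a hypothetical my::string
+ * with a std::string-compatible interface) could be supplied as:
+ * \code
+ * #define CL_HPP_NO_STD_STRING
+ * #include "my_string.h" // hypothetical header providing my::string
+ * namespace cl { using string = my::string; }
+ * #include <CL/cl2.hpp>
+ * \endcode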
+ *
+ * The OpenCL 1.x versions of the C++ bindings included a size_t wrapper
+ * class to interface with kernel enqueue. This caused unpleasant interactions
+ * with the standard size_t declaration and led to namespacing bugs.
+ * In the 2.0 version we have replaced this with a std::array-based interface.
+ * However, the old behaviour can be regained for backward compatibility
+ * using the CL_HPP_ENABLE_SIZE_T_COMPATIBILITY macro.
+ *
+ * Finally, the program construction interface used a clumsy vector-of-pairs
+ * design in the earlier versions. We have replaced that with a cleaner
+ * vector-of-vectors and vector-of-strings design. However, for backward
+ * compatibility old behaviour can be regained with the
+ * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY macro.
+ *
+ * In OpenCL 2.0, OpenCL C is not entirely backward compatible with
+ * earlier versions. As a result, a flag must be passed to the OpenCL C
+ * compiler to request OpenCL 2.0 compilation of kernels, with 1.2 as
+ * the default in the absence of the flag.
+ * In some cases the C++ bindings automatically compile code for ease.
+ * For those cases the compilation defaults to OpenCL C 2.0.
+ * If this is not wanted, the CL_HPP_CL_1_2_DEFAULT_BUILD macro may
+ * be specified to assume 1.2 compilation.
+ * If more fine-grained decisions on a per-kernel basis are required
+ * then explicit build operations that take the flag should be used.
+ *
+ *
+ * \section parameterization Parameters
+ * This header may be parameterized by a set of preprocessor macros.
+ * CL_HPP_TARGET_OPENCL_VERSION
+ * - Defines the target OpenCL runtime version to build the header against.
+ * Defaults to 200, representing OpenCL 2.0.
+ * CL_HPP_NO_STD_STRING
+ * - Do not use the standard library string class.
+ * cl::string is not defined and may be defined by the user before
+ * cl2.hpp is included.
+ * CL_HPP_NO_STD_VECTOR
+ * - Do not use the standard library vector class.
+ * cl::vector is not defined and may be defined by the user before
+ * cl2.hpp is included.
+ * CL_HPP_NO_STD_ARRAY
+ * - Do not use the standard library array class.
+ * cl::array is not defined and may be defined by the user before
+ * cl2.hpp is included.
+ * CL_HPP_NO_STD_UNIQUE_PTR
+ * - Do not use the standard library unique_ptr class.
+ * cl::pointer and the cl::allocate_pointer function are not defined
+ * and may be defined by the user before cl2.hpp is included.
+ * CL_HPP_ENABLE_DEVICE_FISSION
+ * - Enables device fission for OpenCL 1.2 platforms
+ * CL_HPP_ENABLE_EXCEPTIONS
+ * - Enable exceptions for use in the C++ bindings header.
+ * This is the preferred error handling mechanism but is not required.
+ * CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
+ * - Backward compatibility option to support cl.hpp-style size_t class.
+ * Replaces the updated std::array derived version and removal of size_t
+ * from the namespace. Note that in this case the new size_t class
+ * is placed in the cl::compatibility namespace and thus requires
+ * an additional using declaration for direct backward compatibility.
+ * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
+ * - Enable older vector of pairs interface for construction of programs.
+ * CL_HPP_CL_1_2_DEFAULT_BUILD
+ * - Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0
+ * - applies to use of cl::Program construction and other program build variants.
+ *
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes; see the following sections
+ * for descriptions of these features.
+ *
+ * \code
+ #define CL_HPP_ENABLE_EXCEPTIONS
+ #define CL_HPP_TARGET_OPENCL_VERSION 200
+
+ #include <CL/cl2.hpp>
+ #include <iostream>
+ #include <vector>
+ #include <memory>
+ #include <algorithm>
+
+ const int numElements = 32;
+
+ int main(void)
+ {
+ // Filter for a 2.0 platform and set it as the default
+ std::vector<cl::Platform> platforms;
+ cl::Platform::get(&platforms);
+ cl::Platform plat;
+ for (auto &p : platforms) {
+ std::string platver = p.getInfo<CL_PLATFORM_VERSION>();
+ if (platver.find("OpenCL 2.") != std::string::npos) {
+ plat = p;
+ }
+ }
+ if (plat() == 0) {
+ std::cout << "No OpenCL 2.0 platform found.";
+ return -1;
+ }
+
+ cl::Platform newP = cl::Platform::setDefault(plat);
+ if (newP != plat) {
+ std::cout << "Error setting default platform.";
+ return -1;
+ }
+
+ std::string kernel1{
+ "global int globalA;"
+ "kernel void updateGlobal(){"
+ " globalA = 75;"
+ "}"};
+ std::string kernel2{
+ "typedef struct { global int *bar; } Foo; kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, global int *output, int val, write_only pipe int outPipe, queue_t childQueue){"
+ " output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar);"
+ " write_pipe(outPipe, &val);"
+ " queue_t default_queue = get_default_queue(); "
+ " ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); "
+ // Have a child kernel write into third quarter of output
+ " enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
+ " ^{"
+ " output[get_global_size(0)*2 + get_global_id(0)] = inputA[get_global_size(0)*2+get_global_id(0)] + inputB[get_global_size(0)*2+get_global_id(0)] + globalA;"
+ " });"
+ // Have a child kernel write into last quarter of output
+ " enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, "
+ " ^{"
+ " output[get_global_size(0)*3 + get_global_id(0)] = inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2;"
+ " });"
+ "}" };
+
+ // New simpler string interface style
+ std::vector<std::string> programStrings {kernel1, kernel2};
+
+ cl::Program vectorAddProgram(
+ programStrings);
+ try {
+ vectorAddProgram.build("-cl-std=CL2.0");
+ }
+ catch (...) {
+ // Print build info for all devices
+ cl_int buildErr = CL_SUCCESS;
+ auto buildInfo = vectorAddProgram.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&buildErr);
+ for (auto &pair : buildInfo) {
+ std::cerr << pair.second << std::endl << std::endl;
+ }
+
+ return 1;
+ }
+
+ typedef struct { int *bar; } Foo;
+
+ // Get and run kernel that initializes the program-scope global
+ // A test for kernels that take no arguments
+ auto program2Kernel =
+ cl::KernelFunctor<>(vectorAddProgram, "updateGlobal");
+ program2Kernel(
+ cl::EnqueueArgs(
+ cl::NDRange(1)));
+
+ //////////////////
+ // SVM allocations
+
+ cl::pointer<int> anSVMInt = cl::allocate_svm<int, cl::SVMTraitCoarse<>>();
+ *anSVMInt = 5;
+ cl::SVMAllocator<int, cl::SVMTraitCoarse<cl::SVMTraitReadOnly<>>> svmAllocReadOnly;
+ auto fooPointer = cl::allocate_pointer<Foo>(svmAllocReadOnly);
+ fooPointer->bar = anSVMInt.get();
+ cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;
+ std::vector<int, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>> inputA(numElements, 1, svmAlloc);
+ cl::coarse_svm_vector<int> inputB(numElements, 2, svmAlloc);
+
+ //
+ //////////////
+
+ // Traditional cl_mem allocations
+ std::vector<int> output(numElements, 0xdeadbeef);
+ cl::Buffer outputBuffer(begin(output), end(output), false);
+ cl::Pipe aPipe(sizeof(cl_int), numElements / 2);
+
+ // Default command queue, also passed in as a parameter
+ cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault(
+ cl::Context::getDefault(), cl::Device::getDefault());
+
+ auto vectorAddKernel =
+ cl::KernelFunctor<
+ decltype(fooPointer)&,
+ int*,
+ cl::coarse_svm_vector<int>&,
+ cl::Buffer,
+ int,
+ cl::Pipe&,
+ cl::DeviceCommandQueue
+ >(vectorAddProgram, "vectorAdd");
+
+ // Ensure that the additional SVM pointer is available to the kernel
+ // This one was not passed as a parameter
+ vectorAddKernel.setSVMPointers(anSVMInt);
+
+ // Hand control of coarse allocations to runtime
+ cl::enqueueUnmapSVM(anSVMInt);
+ cl::enqueueUnmapSVM(fooPointer);
+ cl::unmapSVM(inputA);
+ cl::unmapSVM(inputB);
+
+ cl_int error;
+ vectorAddKernel(
+ cl::EnqueueArgs(
+ cl::NDRange(numElements/2),
+ cl::NDRange(numElements/2)),
+ fooPointer,
+ inputA.data(),
+ inputB,
+ outputBuffer,
+ 3,
+ aPipe,
+ defaultDeviceQueue,
+ error
+ );
+
+ cl::copy(outputBuffer, begin(output), end(output));
+ // Map the SVM input vectors back for host access
+ cl::mapSVM(inputA);
+ cl::mapSVM(inputB);
+
+ cl::Device d = cl::Device::getDefault();
+
+ std::cout << "Output:\n";
+ for (int i = 0; i < numElements; ++i) {
+ std::cout << "\t" << output[i] << "\n";
+ }
+ std::cout << "\n\n";
+
+ return 0;
+ }
+ *
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef __GNUC__
+#pragma GCC system_header
+#endif
+
+/* Handle deprecated preprocessor definitions. In each case, we only check for
+ * the old name if the new name is not defined, so that user code can define
+ * both and hence work with either version of the bindings.
+ */
+#if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP)
+# pragma message("cl2.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead")
+# define CL_HPP_USE_DX_INTEROP
+#endif
+#if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION)
+# pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead")
+# define CL_HPP_USE_CL_DEVICE_FISSION
+#endif
+#if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS)
+# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead")
+# define CL_HPP_ENABLE_EXCEPTIONS
+#endif
+#if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR)
+# pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. Define CL_HPP_NO_STD_VECTOR instead")
+# define CL_HPP_NO_STD_VECTOR
+#endif
+#if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING)
+# pragma message("cl2.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead")
+# define CL_HPP_NO_STD_STRING
+#endif
+#if defined(VECTOR_CLASS)
+# pragma message("cl2.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead")
+#endif
+#if defined(STRING_CLASS)
+# pragma message("cl2.hpp: STRING_CLASS is deprecated. Alias cl::string instead.")
+#endif
+#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+# pragma message("cl2.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead")
+# define CL_HPP_USER_OVERRIDE_ERROR_STRINGS
+#endif
+
+/* Warn about features that are no longer supported
+ */
+#if defined(__USE_DEV_VECTOR)
+# pragma message("cl2.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors")
+#endif
+#if defined(__USE_DEV_STRING)
+# pragma message("cl2.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors")
+#endif
+
+/* Detect which version to target */
+#if !defined(CL_HPP_TARGET_OPENCL_VERSION)
+# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 200 (OpenCL 2.0)")
+# define CL_HPP_TARGET_OPENCL_VERSION 200
+#endif
+#if CL_HPP_TARGET_OPENCL_VERSION != 100 && CL_HPP_TARGET_OPENCL_VERSION != 110 && CL_HPP_TARGET_OPENCL_VERSION != 120 && CL_HPP_TARGET_OPENCL_VERSION != 200
+# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 200")
+# undef CL_HPP_TARGET_OPENCL_VERSION
+# define CL_HPP_TARGET_OPENCL_VERSION 200
+#endif
+
+#if !defined(CL_HPP_MINIMUM_OPENCL_VERSION)
+# define CL_HPP_MINIMUM_OPENCL_VERSION 200
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && CL_HPP_MINIMUM_OPENCL_VERSION != 110 && CL_HPP_MINIMUM_OPENCL_VERSION != 120 && CL_HPP_MINIMUM_OPENCL_VERSION != 200
+# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 100")
+# undef CL_HPP_MINIMUM_OPENCL_VERSION
+# define CL_HPP_MINIMUM_OPENCL_VERSION 100
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION > CL_HPP_TARGET_OPENCL_VERSION
+# error "CL_HPP_MINIMUM_OPENCL_VERSION must not be greater than CL_HPP_TARGET_OPENCL_VERSION"
+#endif
+
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
+# define CL_USE_DEPRECATED_OPENCL_1_0_APIS
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+# define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+# define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#endif
+#if CL_HPP_MINIMUM_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
+# define CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#endif
+
+#ifdef _WIN32
+
+#include <malloc.h>
+
+#if defined(CL_HPP_USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif // _MSC_VER
+
+ // Check for a valid C++ version
+
+// Need to do both tests here because for some reason __cplusplus is not
+// updated in Visual Studio
+#if (!defined(_MSC_VER) && __cplusplus < 201103L) || (defined(_MSC_VER) && _MSC_VER < 1700)
+#error Visual Studio 2013 or another C++11-supporting compiler required
+#endif
+
+//
+#if defined(CL_HPP_USE_CL_DEVICE_FISSION) || defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/opencl.h>
+#else
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+#if (__cplusplus >= 201103L)
+#define CL_HPP_NOEXCEPT_ noexcept
+#else
+#define CL_HPP_NOEXCEPT_
+#endif
+
+#if defined(_MSC_VER)
+# define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany)
+#else
+# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak))
+#endif // !_MSC_VER
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+#include <iterator>
+#include <mutex>
+#include <cstring>
+#include <functional>
+
+
+// Define a size_type to represent a correctly resolved size_t
+#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)
+namespace cl {
+ using size_type = ::size_t;
+} // namespace cl
+#else // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)
+namespace cl {
+ using size_type = size_t;
+} // namespace cl
+#endif // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)
+
+
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
+
+#if !defined(CL_HPP_NO_STD_VECTOR)
+#include <vector>
+namespace cl {
+ template < class T, class Alloc = std::allocator<T> >
+ using vector = std::vector<T, Alloc>;
+} // namespace cl
+#endif // #if !defined(CL_HPP_NO_STD_VECTOR)
+
+#if !defined(CL_HPP_NO_STD_STRING)
+#include <string>
+namespace cl {
+ using string = std::string;
+} // namespace cl
+#endif // #if !defined(CL_HPP_NO_STD_STRING)
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+#if !defined(CL_HPP_NO_STD_UNIQUE_PTR)
+#include <memory>
+namespace cl {
+ // Replace unique_ptr and allocate_pointer for internal use
+ // to allow user to replace them
+ template<class T, class D>
+ using pointer = std::unique_ptr<T, D>;
+} // namespace cl
+#endif
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if !defined(CL_HPP_NO_STD_ARRAY)
+#include <array>
+namespace cl {
+ template < class T, size_type N >
+ using array = std::array<T, N>;
+} // namespace cl
+#endif // #if !defined(CL_HPP_NO_STD_ARRAY)
+
+// Define size_type appropriately to allow backward-compatibility
+// use of the old size_t interface class
+#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)
+namespace cl {
+ namespace compatibility {
+ /*! \brief class used to interface between C++ and
+ * OpenCL C calls that require arrays of size_t values, whose
+ * size is known statically.
+ */
+ template <int N>
+ class size_t
+ {
+ private:
+ size_type data_[N];
+
+ public:
+ //! \brief Initialize size_t to all 0s
+ size_t()
+ {
+ for (int i = 0; i < N; ++i) {
+ data_[i] = 0;
+ }
+ }
+
+ size_t(const array<size_type, N> &rhs)
+ {
+ for (int i = 0; i < N; ++i) {
+ data_[i] = rhs[i];
+ }
+ }
+
+ size_type& operator[](int index)
+ {
+ return data_[index];
+ }
+
+ const size_type& operator[](int index) const
+ {
+ return data_[index];
+ }
+
+ //! \brief Conversion operator to T*.
+ operator size_type* () { return data_; }
+
+ //! \brief Conversion operator to const T*.
+ operator const size_type* () const { return data_; }
+
+ operator array<size_type, N>() const
+ {
+ array<size_type, N> ret;
+
+ for (int i = 0; i < N; ++i) {
+ ret[i] = data_[i];
+ }
+ return ret;
+ }
+ };
+ } // namespace compatibility
+
+ template<int N>
+ using size_t = compatibility::size_t<N>;
+} // namespace cl
+#endif // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)
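+
+// Illustrative sketch (assumes CL_HPP_ENABLE_SIZE_T_COMPATIBILITY is defined):
+// the compatibility wrapper converts implicitly to size_type*, so it can be
+// passed to legacy calls expecting an array of size_t. The variable names
+// are hypothetical.
+//
+//   cl::size_t<3> region;
+//   region[0] = width; region[1] = height; region[2] = 1;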
+
+// Helper alias to avoid confusing the macros
+namespace cl {
+ namespace detail {
+ using size_t_array = array<size_type, 3>;
+ } // namespace detail
+} // namespace cl
+
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+ class Memory;
+
+#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name) \
+ if (!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddress(#name); \
+ if (!pfn_##name) { \
+ } \
+ }
+
+#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name) \
+ if (!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddressForPlatform(platform, #name); \
+ if (!pfn_##name) { \
+ } \
+ }
+
+ class Program;
+ class Device;
+ class Context;
+ class CommandQueue;
+ class DeviceCommandQueue;
+ class Memory;
+ class Buffer;
+ class Pipe;
+
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ /*! \brief Exception class
+ *
+ * This may be thrown by API functions when CL_HPP_ENABLE_EXCEPTIONS is defined.
+ */
+ class Error : public std::exception
+ {
+ private:
+ cl_int err_;
+ const char * errStr_;
+ public:
+ /*! \brief Create a new CL error exception for a given error code
+ * and corresponding message.
+ *
+ * \param err error code value.
+ *
+ * \param errStr a descriptive string that must remain in scope until
+ * handling of the exception has concluded. If set, it
+ * will be returned by what().
+ */
+ Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+ {}
+
+ ~Error() throw() {}
+
+ /*! \brief Get error string associated with exception
+ *
+ * \return A memory pointer to the error message string.
+ */
+ virtual const char * what() const throw ()
+ {
+ if (errStr_ == NULL) {
+ return "empty";
+ }
+ else {
+ return errStr_;
+ }
+ }
+
+ /*! \brief Get error code associated with exception
+ *
+ * \return The error code.
+ */
+ cl_int err(void) const { return err_; }
+ };
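+
+/* Illustrative sketch: with CL_HPP_ENABLE_EXCEPTIONS defined, failing API
+ * calls throw cl::Error, so a call site can recover both the message and
+ * the raw error code:
+ *
+ *   try {
+ *       // ... OpenCL C++ API calls ...
+ *   } catch (const cl::Error &e) {
+ *       std::cerr << e.what() << " (" << e.err() << ")" << std::endl;
+ *   }
+ */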
+#define CL_HPP_ERR_STR_(x) #x
+#else
+#define CL_HPP_ERR_STR_(x) NULL
+#endif // CL_HPP_ENABLE_EXCEPTIONS
+
+
+namespace detail
+{
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+static inline cl_int errHandler (
+ cl_int err,
+ const char * errStr = NULL)
+{
+ if (err != CL_SUCCESS) {
+ throw Error(err, errStr);
+ }
+ return err;
+}
+#else
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+ (void) errStr; // suppress unused variable warning
+ return err;
+}
+#endif // CL_HPP_ENABLE_EXCEPTIONS
+}
+
+
+
+//! \cond DOXYGEN_DETAIL
+#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR CL_HPP_ERR_STR_(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR CL_HPP_ERR_STR_(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR CL_HPP_ERR_STR_(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR CL_HPP_ERR_STR_(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR CL_HPP_ERR_STR_(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR CL_HPP_ERR_STR_(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR CL_HPP_ERR_STR_(clGetEventProfileInfo)
+#define __GET_MEM_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR CL_HPP_ERR_STR_(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR CL_HPP_ERR_STR_(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR CL_HPP_ERR_STR_(clGetKernelInfo)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __GET_KERNEL_ARG_INFO_ERR CL_HPP_ERR_STR_(clGetKernelArgInfo)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR CL_HPP_ERR_STR_(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR CL_HPP_ERR_STR_(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR CL_HPP_ERR_STR_(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_ERR CL_HPP_ERR_STR_(clCreateContext)
+#define __CREATE_CONTEXT_FROM_TYPE_ERR CL_HPP_ERR_STR_(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR CL_HPP_ERR_STR_(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR CL_HPP_ERR_STR_(clCreateBuffer)
+#define __COPY_ERR CL_HPP_ERR_STR_(cl::copy)
+#define __CREATE_SUBBUFFER_ERR CL_HPP_ERR_STR_(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR CL_HPP_ERR_STR_(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR CL_HPP_ERR_STR_(clCreateFromGLBuffer)
+#define __GET_GL_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetGLObjectInfo)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __CREATE_IMAGE_ERR CL_HPP_ERR_STR_(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR CL_HPP_ERR_STR_(Incorrect image dimensions)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR CL_HPP_ERR_STR_(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR CL_HPP_ERR_STR_(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR CL_HPP_ERR_STR_(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR CL_HPP_ERR_STR_(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR CL_HPP_ERR_STR_(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR CL_HPP_ERR_STR_(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR CL_HPP_ERR_STR_(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR CL_HPP_ERR_STR_(clCreateProgramWithBinary)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __BUILD_PROGRAM_ERR CL_HPP_ERR_STR_(clBuildProgram)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __COMPILE_PROGRAM_ERR CL_HPP_ERR_STR_(clCompileProgram)
+#define __LINK_PROGRAM_ERR CL_HPP_ERR_STR_(clLinkProgram)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __CREATE_KERNELS_IN_PROGRAM_ERR CL_HPP_ERR_STR_(clCreateKernelsInProgram)
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#define __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR CL_HPP_ERR_STR_(clCreateCommandQueueWithProperties)
+#define __CREATE_SAMPLER_WITH_PROPERTIES_ERR CL_HPP_ERR_STR_(clCreateSamplerWithProperties)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR CL_HPP_ERR_STR_(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueFillBuffer)
+#define __ENQUEUE_READ_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueFillImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      CL_HPP_ERR_STR_(clEnqueueUnmapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR CL_HPP_ERR_STR_(clEnqueueNDRangeKernel)
+#define __ENQUEUE_NATIVE_KERNEL CL_HPP_ERR_STR_(clEnqueueNativeKernel)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR CL_HPP_ERR_STR_(clEnqueueMigrateMemObjects)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+#define __ENQUEUE_ACQUIRE_GL_ERR CL_HPP_ERR_STR_(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR CL_HPP_ERR_STR_(clEnqueueReleaseGLObjects)
+
+#define __CREATE_PIPE_ERR CL_HPP_ERR_STR_(clCreatePipe)
+#define __GET_PIPE_INFO_ERR CL_HPP_ERR_STR_(clGetPipeInfo)
+
+
+#define __RETAIN_ERR CL_HPP_ERR_STR_(Retain Object)
+#define __RELEASE_ERR CL_HPP_ERR_STR_(Release Object)
+#define __FLUSH_ERR CL_HPP_ERR_STR_(clFlush)
+#define __FINISH_ERR CL_HPP_ERR_STR_(clFinish)
+#define __VECTOR_CAPACITY_ERR CL_HPP_ERR_STR_(Vector capacity error)
+
+/**
+ * CL 1.2 version that uses device fission.
+ */
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __CREATE_SUB_DEVICES_ERR CL_HPP_ERR_STR_(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES_ERR CL_HPP_ERR_STR_(clCreateSubDevicesEXT)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __ENQUEUE_MARKER_ERR CL_HPP_ERR_STR_(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR CL_HPP_ERR_STR_(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR CL_HPP_ERR_STR_(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR CL_HPP_ERR_STR_(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR CL_HPP_ERR_STR_(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR CL_HPP_ERR_STR_(clCreateImage3D)
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+/**
+ * Deprecated APIs for 2.0
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+#define __CREATE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clCreateCommandQueue)
+#define __ENQUEUE_TASK_ERR CL_HPP_ERR_STR_(clEnqueueTask)
+#define __CREATE_SAMPLER_ERR CL_HPP_ERR_STR_(clCreateSampler)
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueBarrierWithWaitList)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+#endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: the actual parameter passed is an int, which makes this
+// a worse conversion sequence than a specialization that declares the
+// parameter as an int.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+ return f(name, sizeof(T), param, NULL);
+}
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+// Assumes that the output vector was correctly resized on the way in
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, vector<vector<unsigned char>>* param, int)
+{
+ if (name != CL_PROGRAM_BINARIES) {
+ return CL_INVALID_VALUE;
+ }
+ if (param) {
+ // Create array of pointers, calculate total size and pass pointer array in
+ size_type numBinaries = param->size();
+ vector<unsigned char*> binariesPointers(numBinaries);
+
+ size_type totalSize = 0;
+ for (size_type i = 0; i < numBinaries; ++i)
+ {
+ binariesPointers[i] = (*param)[i].data();
+ totalSize += (*param)[i].size();
+ }
+
+ cl_int err = f(name, totalSize, binariesPointers.data(), NULL);
+
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+
+
+ return CL_SUCCESS;
+}
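+
+/* A minimal sketch of the sizing contract noted above (illustrative only;
+ * `program` is a hypothetical cl::Program): the outer vector is pre-sized
+ * from CL_PROGRAM_BINARY_SIZES before CL_PROGRAM_BINARIES is queried.
+ *
+ * \code
+ * vector<size_type> sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
+ * vector<vector<unsigned char>> binaries(sizes.size());
+ * for (size_type i = 0; i < sizes.size(); ++i) {
+ *     binaries[i].resize(sizes[i]); // resize each binary on the way in
+ * }
+ * program.getInfo(CL_PROGRAM_BINARIES, &binaries);
+ * \endcode
+ */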
+
+// Specialized getInfoHelper for vector params
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, vector<T>* param, long)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ const size_type elements = required / sizeof(T);
+
+ // Temporary to avoid changing param on an error
+ vector<T> localData(elements);
+ err = f(name, required, localData.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ if (param) {
+ *param = std::move(localData);
+ }
+
+ return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(
+ Func f, cl_uint name, vector<T>* param, int, typename T::cl_type = 0)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ const size_type elements = required / sizeof(typename T::cl_type);
+
+ vector<typename T::cl_type> value(elements);
+ err = f(name, required, value.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ if (param) {
+ // Assign to convert CL type to T for each element
+ param->resize(elements);
+
+ // Assign to param, constructing with retain behaviour
+ // to correctly capture each underlying CL object
+ for (size_type i = 0; i < elements; i++) {
+ (*param)[i] = T(value[i], true);
+ }
+ }
+ return CL_SUCCESS;
+}
+
+// Specialized getInfoHelper for string params
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, string* param, long)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ // std::string has a constant data member
+ // a char vector does not
+ if (required > 0) {
+ vector<char> value(required);
+ err = f(name, required, value.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ if (param) {
+ param->assign(begin(value), prev(end(value)));
+ }
+ }
+ else if (param) {
+ param->assign("");
+ }
+ return CL_SUCCESS;
+}
+
+// Specialized getInfoHelper for fixed-size array<size_type, N> params
+template <typename Func, size_type N>
+inline cl_int getInfoHelper(Func f, cl_uint name, array<size_type, N>* param, long)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ size_type elements = required / sizeof(size_type);
+ vector<size_type> value(elements, 0);
+
+ err = f(name, required, value.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ // Bound the copy with N to prevent overruns
+ // if passed N > than the amount copied
+ if (elements > N) {
+ elements = N;
+ }
+ for (size_type i = 0; i < elements; ++i) {
+ (*param)[i] = value[i];
+ }
+
+ return CL_SUCCESS;
+}
+
+template<typename T> struct ReferenceHandler;
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+ typename T::cl_type value;
+ cl_int err = f(name, sizeof(value), &value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ *param = value;
+ if (value != NULL)
+ {
+ err = param->retain();
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+ return CL_SUCCESS;
+}
+
+#define CL_HPP_PARAM_NAME_INFO_1_0_(F) \
+ F(cl_platform_info, CL_PLATFORM_PROFILE, string) \
+ F(cl_platform_info, CL_PLATFORM_VERSION, string) \
+ F(cl_platform_info, CL_PLATFORM_NAME, string) \
+ F(cl_platform_info, CL_PLATFORM_VENDOR, string) \
+ F(cl_platform_info, CL_PLATFORM_EXTENSIONS, string) \
+ \
+ F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+ F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_type) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, cl::vector<size_type>) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+ F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, size_type) \
+ F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_type) \
+ F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+ F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+ F(cl_device_info, CL_DEVICE_NAME, string) \
+ F(cl_device_info, CL_DEVICE_VENDOR, string) \
+ F(cl_device_info, CL_DRIVER_VERSION, string) \
+ F(cl_device_info, CL_DEVICE_PROFILE, string) \
+ F(cl_device_info, CL_DEVICE_VERSION, string) \
+ F(cl_device_info, CL_DEVICE_EXTENSIONS, string) \
+ \
+ F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
+ F(cl_context_info, CL_CONTEXT_DEVICES, cl::vector<Device>) \
+ F(cl_context_info, CL_CONTEXT_PROPERTIES, cl::vector<cl_context_properties>) \
+ \
+ F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
+ F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
+ F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
+ F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \
+ \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
+ F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
+ \
+ F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
+ F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
+ F(cl_mem_info, CL_MEM_SIZE, size_type) \
+ F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
+ F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
+ F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
+ F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
+ \
+ F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
+ F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, size_type) \
+ F(cl_image_info, CL_IMAGE_ROW_PITCH, size_type) \
+ F(cl_image_info, CL_IMAGE_SLICE_PITCH, size_type) \
+ F(cl_image_info, CL_IMAGE_WIDTH, size_type) \
+ F(cl_image_info, CL_IMAGE_HEIGHT, size_type) \
+ F(cl_image_info, CL_IMAGE_DEPTH, size_type) \
+ \
+ F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
+ F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
+ F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
+ F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
+ F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
+ \
+ F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
+ F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
+ F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
+ F(cl_program_info, CL_PROGRAM_DEVICES, cl::vector<Device>) \
+ F(cl_program_info, CL_PROGRAM_SOURCE, string) \
+ F(cl_program_info, CL_PROGRAM_BINARY_SIZES, cl::vector<size_type>) \
+ F(cl_program_info, CL_PROGRAM_BINARIES, cl::vector<cl::vector<unsigned char>>) \
+ \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, string) \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, string) \
+ \
+ F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, string) \
+ F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
+ F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
+ F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
+ F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
+ \
+ F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, size_type) \
+ F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::detail::size_t_array) \
+ F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
+ \
+ F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
+ F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
+ F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
+ F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+
+#define CL_HPP_PARAM_NAME_INFO_1_1_(F) \
+ F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
+ F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, string) \
+ \
+ F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
+ F(cl_mem_info, CL_MEM_OFFSET, size_type) \
+ \
+ F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \
+ F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
+ \
+ F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
+
+#define CL_HPP_PARAM_NAME_INFO_1_2_(F) \
+ F(cl_program_info, CL_PROGRAM_NUM_KERNELS, size_type) \
+ F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, string) \
+ \
+ F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
+ \
+ F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, string) \
+ \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, string) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \
+ F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \
+ \
+ F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \
+ F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector<cl_device_partition_property>) \
+ F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector<cl_device_partition_property>) \
+ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \
+ F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
+ F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \
+ \
+ F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \
+ F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \
+ F(cl_image_info, CL_IMAGE_NUM_SAMPLES, cl_uint)
+
+#define CL_HPP_PARAM_NAME_INFO_2_0_(F) \
+ F(cl_device_info, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, cl_command_queue_properties) \
+ F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, cl_command_queue_properties) \
+ F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_QUEUES, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_EVENTS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_PIPE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PIPE_MAX_PACKET_SIZE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_SVM_CAPABILITIES, cl_device_svm_capabilities) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \
+ F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \
+ F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \
+ F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \
+ F(cl_pipe_info, CL_PIPE_PACKET_SIZE, cl_uint) \
+ F(cl_pipe_info, CL_PIPE_MAX_PACKETS, cl_uint)
+
+#define CL_HPP_PARAM_NAME_DEVICE_FISSION_(F) \
+ F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+ F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, cl::vector<cl_device_partition_property_ext>) \
+ F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+ F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector<cl_device_partition_property_ext>)
+
+template <typename enum_type, cl_int Name>
+struct param_traits {};
+
+#define CL_HPP_DECLARE_PARAM_TRAITS_(token, param_name, T) \
+struct token; \
+template<> \
+struct param_traits<detail:: token,param_name> \
+{ \
+ enum { value = param_name }; \
+ typedef T param_type; \
+};
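+
+/* A sketch of one expansion of the macro above (illustrative only): applied
+ * to the table row F(cl_device_info, CL_DEVICE_NAME, string) it produces
+ * roughly the following, mapping the query token to its result type.
+ *
+ * \code
+ * struct cl_device_info;
+ * template <>
+ * struct param_traits<detail::cl_device_info, CL_DEVICE_NAME>
+ * {
+ *     enum { value = CL_DEVICE_NAME };
+ *     typedef string param_type;
+ * };
+ * \endcode
+ */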
+
+CL_HPP_PARAM_NAME_INFO_1_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+CL_HPP_PARAM_NAME_INFO_1_1_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+CL_HPP_PARAM_NAME_INFO_1_2_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+CL_HPP_PARAM_NAME_INFO_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+
+// Flags deprecated in OpenCL 2.0
+#define CL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(F) \
+ F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#define CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(F) \
+ F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool)
+
+#define CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(F) \
+ F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer)
+
+// Include deprecated query flags based on versions
+// Only include deprecated 1.0 flags if 2.0 not active as there is an enum clash
+#if CL_HPP_TARGET_OPENCL_VERSION > 100 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 && CL_HPP_TARGET_OPENCL_VERSION < 200
+CL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION > 100 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 && CL_HPP_TARGET_OPENCL_VERSION < 200
+#if CL_HPP_TARGET_OPENCL_VERSION > 110 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION > 110 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+#if CL_HPP_TARGET_OPENCL_VERSION > 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION > 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+
+#if defined(CL_HPP_USE_CL_DEVICE_FISSION)
+CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_);
+#endif // CL_HPP_USE_CL_DEVICE_FISSION
+
+#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string)
+#endif
+
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
+#endif
+
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector<size_type>)
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_WARP_SIZE_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_GPU_OVERLAP_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
+#endif
+
+// Convenience functions
+
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+ return getInfoHelper(f, name, param, 0);
+}
+
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+ Func f_; const Arg0& arg0_;
+ cl_int operator ()(
+ cl_uint param, size_type size, void* value, size_type* size_ret)
+ { return f_(arg0_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+ Func f_; const Arg0& arg0_; const Arg1& arg1_;
+ cl_int operator ()(
+ cl_uint param, size_type size, void* value, size_type* size_ret)
+ { return f_(arg0_, arg1_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+ GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
+ return getInfoHelper(f0, name, param, 0);
+}
+
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+ GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
+ return getInfoHelper(f0, name, param, 0);
+}
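+
+/* A minimal sketch of how these adapters are used (illustrative only; `dev`
+ * is a hypothetical valid cl_device_id): the functor binds the leading object
+ * argument so getInfoHelper only sees the common
+ * (name, size, value, size_ret) call shape.
+ *
+ * \code
+ * cl_uint units = 0;
+ * detail::getInfo(&::clGetDeviceInfo, dev, CL_DEVICE_MAX_COMPUTE_UNITS, &units);
+ * \endcode
+ */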
+
+
+template<typename T>
+struct ReferenceHandler
+{ };
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+/**
+ * OpenCL 1.2 devices do have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+ /**
+ * Retain the device.
+ * \param device A valid device created using createSubDevices
+ * \return
+ * CL_SUCCESS if the function executed successfully.
+ * CL_INVALID_DEVICE if device was not a valid subdevice
+ * CL_OUT_OF_RESOURCES
+ * CL_OUT_OF_HOST_MEMORY
+ */
+ static cl_int retain(cl_device_id device)
+ { return ::clRetainDevice(device); }
+ /**
+     * Release the device.
+ * \param device A valid device created using createSubDevices
+ * \return
+ * CL_SUCCESS if the function executed successfully.
+ * CL_INVALID_DEVICE if device was not a valid subdevice
+ * CL_OUT_OF_RESOURCES
+ * CL_OUT_OF_HOST_MEMORY
+ */
+ static cl_int release(cl_device_id device)
+ { return ::clReleaseDevice(device); }
+};
+#else // CL_HPP_TARGET_OPENCL_VERSION >= 120
+/**
+ * OpenCL 1.1 devices do not have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+ // cl_device_id does not have retain().
+ static cl_int retain(cl_device_id)
+ { return CL_SUCCESS; }
+ // cl_device_id does not have release().
+ static cl_int release(cl_device_id)
+ { return CL_SUCCESS; }
+};
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+ // cl_platform_id does not have retain().
+ static cl_int retain(cl_platform_id)
+ { return CL_SUCCESS; }
+ // cl_platform_id does not have release().
+ static cl_int release(cl_platform_id)
+ { return CL_SUCCESS; }
+};
+
+template <>
+struct ReferenceHandler<cl_context>
+{
+ static cl_int retain(cl_context context)
+ { return ::clRetainContext(context); }
+ static cl_int release(cl_context context)
+ { return ::clReleaseContext(context); }
+};
+
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+ static cl_int retain(cl_command_queue queue)
+ { return ::clRetainCommandQueue(queue); }
+ static cl_int release(cl_command_queue queue)
+ { return ::clReleaseCommandQueue(queue); }
+};
+
+template <>
+struct ReferenceHandler<cl_mem>
+{
+ static cl_int retain(cl_mem memory)
+ { return ::clRetainMemObject(memory); }
+ static cl_int release(cl_mem memory)
+ { return ::clReleaseMemObject(memory); }
+};
+
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+ static cl_int retain(cl_sampler sampler)
+ { return ::clRetainSampler(sampler); }
+ static cl_int release(cl_sampler sampler)
+ { return ::clReleaseSampler(sampler); }
+};
+
+template <>
+struct ReferenceHandler<cl_program>
+{
+ static cl_int retain(cl_program program)
+ { return ::clRetainProgram(program); }
+ static cl_int release(cl_program program)
+ { return ::clReleaseProgram(program); }
+};
+
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+ static cl_int retain(cl_kernel kernel)
+ { return ::clRetainKernel(kernel); }
+ static cl_int release(cl_kernel kernel)
+ { return ::clReleaseKernel(kernel); }
+};
+
+template <>
+struct ReferenceHandler<cl_event>
+{
+ static cl_int retain(cl_event event)
+ { return ::clRetainEvent(event); }
+ static cl_int release(cl_event event)
+ { return ::clReleaseEvent(event); }
+};
+
+
+// Extracts version number with major in the upper 16 bits, minor in the lower 16
+static cl_uint getVersion(const vector<char> &versionInfo)
+{
+ int highVersion = 0;
+ int lowVersion = 0;
+    int index = 7; // skip the "OpenCL " prefix of the version string
+ while(versionInfo[index] != '.' ) {
+ highVersion *= 10;
+ highVersion += versionInfo[index]-'0';
+ ++index;
+ }
+ ++index;
+ while(versionInfo[index] != ' ' && versionInfo[index] != '\0') {
+ lowVersion *= 10;
+ lowVersion += versionInfo[index]-'0';
+ ++index;
+ }
+ return (highVersion << 16) | lowVersion;
+}
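+
+/* Worked example for getVersion (illustrative only): for a version string
+ * such as "OpenCL 1.2 <platform-specific>", parsing starts just past the
+ * "OpenCL " prefix, giving highVersion == 1 and lowVersion == 2, so the
+ * function returns (1 << 16) | 2 == 0x10002.
+ */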
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+static cl_uint getPlatformVersion(cl_platform_id platform)
+{
+ size_type size = 0;
+ clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+
+ vector<char> versionInfo(size);
+ clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, versionInfo.data(), &size);
+ return getVersion(versionInfo);
+}
+
+static cl_uint getDevicePlatformVersion(cl_device_id device)
+{
+ cl_platform_id platform;
+ clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+ return getPlatformVersion(platform);
+}
+
+static cl_uint getContextPlatformVersion(cl_context context)
+{
+    // The platform cannot be queried directly, so we first have to grab a
+    // device and obtain its platform
+ size_type size = 0;
+ clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+ if (size == 0)
+ return 0;
+ vector<cl_device_id> devices(size/sizeof(cl_device_id));
+ clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), NULL);
+ return getDevicePlatformVersion(devices[0]);
+}
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+
+template <typename T>
+class Wrapper
+{
+public:
+ typedef T cl_type;
+
+protected:
+ cl_type object_;
+
+public:
+ Wrapper() : object_(NULL) { }
+
+ Wrapper(const cl_type &obj, bool retainObject) : object_(obj)
+ {
+ if (retainObject) {
+ detail::errHandler(retain(), __RETAIN_ERR);
+ }
+ }
+
+ ~Wrapper()
+ {
+ if (object_ != NULL) { release(); }
+ }
+
+ Wrapper(const Wrapper<cl_type>& rhs)
+ {
+ object_ = rhs.object_;
+ detail::errHandler(retain(), __RETAIN_ERR);
+ }
+
+ Wrapper(Wrapper<cl_type>&& rhs) CL_HPP_NOEXCEPT_
+ {
+ object_ = rhs.object_;
+ rhs.object_ = NULL;
+ }
+
+ Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+ {
+ if (this != &rhs) {
+ detail::errHandler(release(), __RELEASE_ERR);
+ object_ = rhs.object_;
+ detail::errHandler(retain(), __RETAIN_ERR);
+ }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (Wrapper<cl_type>&& rhs)
+ {
+ if (this != &rhs) {
+ detail::errHandler(release(), __RELEASE_ERR);
+ object_ = rhs.object_;
+ rhs.object_ = NULL;
+ }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (const cl_type &rhs)
+ {
+ detail::errHandler(release(), __RELEASE_ERR);
+ object_ = rhs;
+ return *this;
+ }
+
+ const cl_type& operator ()() const { return object_; }
+
+ cl_type& operator ()() { return object_; }
+
+ const cl_type get() const { return object_; }
+
+ cl_type get() { return object_; }
+
+
+protected:
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+ cl_int retain() const
+ {
+ if (object_ != nullptr) {
+ return ReferenceHandler<cl_type>::retain(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+
+ cl_int release() const
+ {
+ if (object_ != nullptr) {
+ return ReferenceHandler<cl_type>::release(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+};
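+
+/* A minimal sketch of the RAII semantics provided by Wrapper (illustrative
+ * only; `raw` is a hypothetical valid cl_context whose reference the caller
+ * owns):
+ *
+ * \code
+ * {
+ *     detail::Wrapper<cl_context> w(raw, true); // retains: refcount + 1
+ * }                                             // destructor releases it
+ * \endcode
+ */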
+
+template <>
+class Wrapper<cl_device_id>
+{
+public:
+ typedef cl_device_id cl_type;
+
+protected:
+ cl_type object_;
+ bool referenceCountable_;
+
+ static bool isReferenceCountable(cl_device_id device)
+ {
+ bool retVal = false;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ if (device != NULL) {
+ int version = getDevicePlatformVersion(device);
+ if(version > ((1 << 16) + 1)) {
+ retVal = true;
+ }
+ }
+#else // CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ retVal = true;
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+ return retVal;
+ }
+
+public:
+ Wrapper() : object_(NULL), referenceCountable_(false)
+ {
+ }
+
+ Wrapper(const cl_type &obj, bool retainObject) :
+ object_(obj),
+ referenceCountable_(false)
+ {
+ referenceCountable_ = isReferenceCountable(obj);
+
+ if (retainObject) {
+ detail::errHandler(retain(), __RETAIN_ERR);
+ }
+ }
+
+ ~Wrapper()
+ {
+ release();
+ }
+
+ Wrapper(const Wrapper<cl_type>& rhs)
+ {
+ object_ = rhs.object_;
+ referenceCountable_ = isReferenceCountable(object_);
+ detail::errHandler(retain(), __RETAIN_ERR);
+ }
+
+ Wrapper(Wrapper<cl_type>&& rhs) CL_HPP_NOEXCEPT_
+ {
+ object_ = rhs.object_;
+ referenceCountable_ = rhs.referenceCountable_;
+ rhs.object_ = NULL;
+ rhs.referenceCountable_ = false;
+ }
+
+ Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+ {
+ if (this != &rhs) {
+ detail::errHandler(release(), __RELEASE_ERR);
+ object_ = rhs.object_;
+ referenceCountable_ = rhs.referenceCountable_;
+ detail::errHandler(retain(), __RETAIN_ERR);
+ }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (Wrapper<cl_type>&& rhs)
+ {
+ if (this != &rhs) {
+ detail::errHandler(release(), __RELEASE_ERR);
+ object_ = rhs.object_;
+ referenceCountable_ = rhs.referenceCountable_;
+ rhs.object_ = NULL;
+ rhs.referenceCountable_ = false;
+ }
+ return *this;
+ }
+
+ Wrapper<cl_type>& operator = (const cl_type &rhs)
+ {
+ detail::errHandler(release(), __RELEASE_ERR);
+ object_ = rhs;
+ referenceCountable_ = isReferenceCountable(object_);
+ return *this;
+ }
+
+ const cl_type& operator ()() const { return object_; }
+
+ cl_type& operator ()() { return object_; }
+
+ const cl_type get() const { return object_; }
+
+ cl_type get() { return object_; }
+
+protected:
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+ template<typename Func, typename U>
+ friend inline cl_int getInfoHelper(Func, cl_uint, vector<U>*, int, typename U::cl_type);
+
+ cl_int retain() const
+ {
+ if( object_ != nullptr && referenceCountable_ ) {
+ return ReferenceHandler<cl_type>::retain(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+
+ cl_int release() const
+ {
+ if (object_ != nullptr && referenceCountable_) {
+ return ReferenceHandler<cl_type>::release(object_);
+ }
+ else {
+ return CL_SUCCESS;
+ }
+ }
+};
+
+template <typename T>
+inline bool operator==(const Wrapper<T> &lhs, const Wrapper<T> &rhs)
+{
+ return lhs() == rhs();
+}
+
+template <typename T>
+inline bool operator!=(const Wrapper<T> &lhs, const Wrapper<T> &rhs)
+{
+ return !operator==(lhs, rhs);
+}
+
+} // namespace detail
+//! \endcond
+
+
+using BuildLogType = vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, CL_PROGRAM_BUILD_LOG>::param_type>>;
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+/**
+* Exception class for build errors to carry build info
+*/
+class BuildError : public Error
+{
+private:
+ BuildLogType buildLogs;
+public:
+ BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec)
+ {
+ }
+
+ BuildLogType getBuildLog() const
+ {
+ return buildLogs;
+ }
+};
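+
+/* A minimal sketch of catching BuildError (illustrative only; `program` is a
+ * hypothetical cl::Program, and the default std::string-based cl::string is
+ * assumed):
+ *
+ * \code
+ * try {
+ *     program.build();
+ * } catch (const BuildError &e) {
+ *     for (auto &log : e.getBuildLog()) {
+ *         std::cerr << log.second << std::endl; // per-device build log
+ *     }
+ * }
+ * \endcode
+ */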
+namespace detail {
+ static inline cl_int buildErrHandler(
+ cl_int err,
+ const char * errStr,
+ const BuildLogType &buildLogs)
+ {
+ if (err != CL_SUCCESS) {
+ throw BuildError(err, errStr, buildLogs);
+ }
+ return err;
+ }
+} // namespace detail
+
+#else
+namespace detail {
+ static inline cl_int buildErrHandler(
+ cl_int err,
+ const char * errStr,
+ const BuildLogType &buildLogs)
+ {
+ (void)buildLogs; // suppress unused variable warning
+ (void)errStr;
+ return err;
+ }
+} // namespace detail
+#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
+
+
+/*! \struct ImageFormat
+ * \brief Adds constructors and member functions for cl_image_format.
+ *
+ * \see cl_image_format
+ */
+struct ImageFormat : public cl_image_format
+{
+ //! \brief Default constructor - performs no initialization.
+ ImageFormat(){}
+
+ //! \brief Initializing constructor.
+ ImageFormat(cl_channel_order order, cl_channel_type type)
+ {
+ image_channel_order = order;
+ image_channel_data_type = type;
+ }
+
+ //! \brief Assignment operator.
+ ImageFormat& operator = (const ImageFormat& rhs)
+ {
+ if (this != &rhs) {
+ this->image_channel_data_type = rhs.image_channel_data_type;
+ this->image_channel_order = rhs.image_channel_order;
+ }
+ return *this;
+ }
+};
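+
+/* A minimal usage sketch for ImageFormat (illustrative only):
+ *
+ * \code
+ * cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8); // 8-bit normalized RGBA
+ * \endcode
+ */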
+
+/*! \brief Class interface for cl_device_id.
+ *
+ * \note Copies of these objects are inexpensive, since they don't 'own'
+ * any underlying resources or data structures.
+ *
+ * \see cl_device_id
+ */
+class Device : public detail::Wrapper<cl_device_id>
+{
+private:
+ static std::once_flag default_initialized_;
+ static Device default_;
+ static cl_int default_error_;
+
+    /*! \brief Create the default device.
+ *
+ * This sets @c default_ and @c default_error_. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefault();
+
+    /*! \brief Create the default device from a provided device.
+ *
+ * This sets @c default_. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefaultProvided(const Device &p) {
+ default_ = p;
+ }
+
+public:
+#ifdef CL_HPP_UNIT_TEST_ENABLE
+ /*! \brief Reset the default.
+ *
+ * This sets @c default_ to an empty value to support cleanup in
+ * the unit test framework.
+ * This function is not thread safe.
+ */
+ static void unitTestClearDefault() {
+ default_ = Device();
+ }
+#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE
+
+ //! \brief Default constructor - initializes to NULL.
+ Device() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Constructor from cl_device_id.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ explicit Device(const cl_device_id &device, bool retainObject = false) :
+ detail::Wrapper<cl_type>(device, retainObject) { }
+
+ /*! \brief Returns the first device on the default context.
+ *
+ * \see Context::getDefault()
+ */
+ static Device getDefault(
+ cl_int *errResult = NULL)
+ {
+ std::call_once(default_initialized_, makeDefault);
+ detail::errHandler(default_error_);
+ if (errResult != NULL) {
+ *errResult = default_error_;
+ }
+ return default_;
+ }
+
+ /**
+ * Modify the default device to be used by
+ * subsequent operations.
+ * Will only set the default if no default was previously created.
+ * @return updated default device.
+ * Should be compared to the passed value to ensure that it was updated.
+ */
+ static Device setDefault(const Device &default_device)
+ {
+ std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_device));
+ detail::errHandler(default_error_);
+ return default_;
+ }
+
+ /*! \brief Assignment operator from cl_device_id.
+ *
+ * This simply copies the device ID value, which is an inexpensive operation.
+ */
+ Device& operator = (const cl_device_id& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Device(const Device& dev) : detail::Wrapper<cl_type>(dev) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Device& operator = (const Device &dev)
+ {
+ detail::Wrapper<cl_type>::operator=(dev);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Device(Device&& dev) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(dev)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Device& operator = (Device &&dev)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(dev));
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetDeviceInfo().
+ template <typename T>
+ cl_int getInfo(cl_device_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetDeviceInfo, object_, name, param),
+ __GET_DEVICE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetDeviceInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_device_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_device_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
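+
+    /* A minimal usage sketch for the two getInfo overloads above
+     * (illustrative only):
+     *
+     * \code
+     * cl::Device d = cl::Device::getDefault();
+     * string name = d.getInfo<CL_DEVICE_NAME>();      // by value
+     * cl_uint units = 0;
+     * d.getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &units); // by out-parameter
+     * \endcode
+     */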
+
+ /**
+ * CL 1.2 version
+ */
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ //! \brief Wrapper for clCreateSubDevices().
+ cl_int createSubDevices(
+ const cl_device_partition_property * properties,
+ vector<Device>* devices)
+ {
+ cl_uint n = 0;
+ cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+ }
+
+ vector<cl_device_id> ids(n);
+ err = clCreateSubDevices(object_, properties, n, ids.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+ }
+
+ // Cannot trivially assign because we need to capture intermediates
+ // with safe construction
+ if (devices) {
+ devices->resize(ids.size());
+
+ // Assign to param, constructing with retain behaviour
+ // to correctly capture each underlying CL object
+ for (size_type i = 0; i < ids.size(); i++) {
+ // We do not need to retain because this device is being created
+ // by the runtime
+ (*devices)[i] = Device(ids[i], false);
+ }
+ }
+
+ return CL_SUCCESS;
+ }
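+
+    /* A minimal usage sketch (illustrative only; `dev` is a hypothetical
+     * cl::Device on a platform that supports equal partitioning):
+     *
+     * \code
+     * cl_device_partition_property props[] =
+     *     { CL_DEVICE_PARTITION_EQUALLY, 4, 0 }; // 4 compute units each
+     * vector<Device> subDevices;
+     * cl_int err = dev.createSubDevices(props, &subDevices);
+     * \endcode
+     */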
+#elif defined(CL_HPP_USE_CL_DEVICE_FISSION)
+
+/**
+ * CL 1.1 version that uses device fission extension.
+ */
+ cl_int createSubDevices(
+ const cl_device_partition_property_ext * properties,
+ vector<Device>* devices)
+ {
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+ cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+ CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT);
+
+ cl_uint n = 0;
+ cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+ }
+
+ vector<cl_device_id> ids(n);
+ err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR);
+ }
+ // Cannot trivially assign because we need to capture intermediates
+ // with safe construction
+ if (devices) {
+ devices->resize(ids.size());
+
+ // Assign to param, constructing with retain behaviour
+ // to correctly capture each underlying CL object
+ for (size_type i = 0; i < ids.size(); i++) {
+ // We do not need to retain because this device is being created
+ // by the runtime
+ (*devices)[i] = Device(ids[i], false);
+ }
+ }
+ return CL_SUCCESS;
+ }
+#endif // defined(CL_HPP_USE_CL_DEVICE_FISSION)
+};
+
+CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_;
+CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_;
+CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Device::default_error_ = CL_SUCCESS;
+
+/*! \brief Class interface for cl_platform_id.
+ *
+ * \note Copies of these objects are inexpensive, since they don't 'own'
+ * any underlying resources or data structures.
+ *
+ * \see cl_platform_id
+ */
+class Platform : public detail::Wrapper<cl_platform_id>
+{
+private:
+ static std::once_flag default_initialized_;
+ static Platform default_;
+ static cl_int default_error_;
+
+    /*! \brief Create the default platform.
+ *
+ * This sets @c default_ and @c default_error_. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefault() {
+ /* Throwing an exception from a call_once invocation does not do
+ * what we wish, so we catch it and save the error.
+ */
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ try
+#endif
+ {
+            // No default was provided, so generate one from the
+            // first available platform ID
+ cl_uint n = 0;
+
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ default_error_ = err;
+ return;
+ }
+ if (n == 0) {
+ default_error_ = CL_INVALID_PLATFORM;
+ return;
+ }
+
+ vector<cl_platform_id> ids(n);
+ err = ::clGetPlatformIDs(n, ids.data(), NULL);
+ if (err != CL_SUCCESS) {
+ default_error_ = err;
+ return;
+ }
+
+ default_ = Platform(ids[0]);
+ }
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ catch (cl::Error &e) {
+ default_error_ = e.err();
+ }
+#endif
+ }
+
+ /*! \brief Create the default platform from a provided platform.
+ *
+ * This sets @c default_. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefaultProvided(const Platform &p) {
+ default_ = p;
+ }
+
+public:
+#ifdef CL_HPP_UNIT_TEST_ENABLE
+ /*! \brief Reset the default.
+ *
+ * This sets @c default_ to an empty value to support cleanup in
+ * the unit test framework.
+ * This function is not thread safe.
+ */
+ static void unitTestClearDefault() {
+ default_ = Platform();
+ }
+#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE
+
+ //! \brief Default constructor - initializes to NULL.
+ Platform() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Constructor from cl_platform_id.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ explicit Platform(const cl_platform_id &platform, bool retainObject = false) :
+ detail::Wrapper<cl_type>(platform, retainObject) { }
+
+ /*! \brief Assignment operator from cl_platform_id.
+ *
+ * This simply copies the platform ID value, which is an inexpensive operation.
+ */
+ Platform& operator = (const cl_platform_id& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ static Platform getDefault(
+ cl_int *errResult = NULL)
+ {
+ std::call_once(default_initialized_, makeDefault);
+ detail::errHandler(default_error_);
+ if (errResult != NULL) {
+ *errResult = default_error_;
+ }
+ return default_;
+ }
+
+ /**
+ * Modify the default platform to be used by
+ * subsequent operations.
+ * Will only set the default if no default was previously created.
+ * @return updated default platform.
+ * Should be compared to the passed value to ensure that it was updated.
+ */
+ static Platform setDefault(const Platform &default_platform)
+ {
+ std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_platform));
+ detail::errHandler(default_error_);
+ return default_;
+ }
+
+ //! \brief Wrapper for clGetPlatformInfo().
+ cl_int getInfo(cl_platform_info name, string* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetPlatformInfo, object_, name, param),
+ __GET_PLATFORM_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetPlatformInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_platform_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_platform_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Gets a list of devices for this platform.
+ *
+ * Wraps clGetDeviceIDs().
+ */
+ cl_int getDevices(
+ cl_device_type type,
+ vector<Device>* devices) const
+ {
+ cl_uint n = 0;
+ if( devices == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+ }
+ cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ vector<cl_device_id> ids(n);
+ err = ::clGetDeviceIDs(object_, type, n, ids.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ // Cannot trivially assign because we need to capture intermediates
+ // with safe construction
+ // We must retain things we obtain from the API to avoid releasing
+ // API-owned objects.
+ if (devices) {
+ devices->resize(ids.size());
+
+ // Assign to param, constructing with retain behaviour
+ // to correctly capture each underlying CL object
+ for (size_type i = 0; i < ids.size(); i++) {
+ (*devices)[i] = Device(ids[i], true);
+ }
+ }
+ return CL_SUCCESS;
+ }
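+
+    /* A minimal usage sketch (illustrative only; `platform` is a hypothetical
+     * cl::Platform):
+     *
+     * \code
+     * vector<Device> gpus;
+     * cl_int err = platform.getDevices(CL_DEVICE_TYPE_GPU, &gpus);
+     * \endcode
+     */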
+
+#if defined(CL_HPP_USE_DX_INTEROP)
+ /*! \brief Get the list of available D3D10 devices.
+ *
+ * \param d3d_device_source.
+ *
+ * \param d3d_object.
+ *
+ * \param d3d_device_set.
+ *
+     * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
+     * values returned in devices can be used to identify a specific OpenCL
+     * device. Must not be NULL; if it is, CL_INVALID_ARG_VALUE is returned.
+     *
+     * \return One of the following values:
+     * - CL_SUCCESS if the function is executed successfully.
+     *
+     * The application can query specific capabilities of the OpenCL device(s)
+     * returned by cl::getDevices. This can be used by the application to
+     * determine which device(s) to use.
+     *
+     * \note If exceptions are enabled and a value other than CL_SUCCESS
+     * would be returned, a cl::Error exception is thrown instead.
+ */
+ cl_int getDevices(
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ vector<Device>* devices) const
+ {
+ typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint* num_devices);
+
+ if( devices == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+ }
+
+ static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+ CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(object_, clGetDeviceIDsFromD3D10KHR);
+
+ cl_uint n = 0;
+ cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
+ object_,
+ d3d_device_source,
+ d3d_object,
+ d3d_device_set,
+ 0,
+ NULL,
+ &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ vector<cl_device_id> ids(n);
+ err = pfn_clGetDeviceIDsFromD3D10KHR(
+ object_,
+ d3d_device_source,
+ d3d_object,
+ d3d_device_set,
+ n,
+ ids.data(),
+ NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+ }
+
+ // Cannot trivially assign because we need to capture intermediates
+ // with safe construction
+ // We must retain things we obtain from the API to avoid releasing
+ // API-owned objects.
+ if (devices) {
+ devices->resize(ids.size());
+
+ // Assign to param, constructing with retain behaviour
+ // to correctly capture each underlying CL object
+ for (size_type i = 0; i < ids.size(); i++) {
+ (*devices)[i] = Device(ids[i], true);
+ }
+ }
+ return CL_SUCCESS;
+ }
+#endif
+
+ /*! \brief Gets a list of available platforms.
+ *
+ * Wraps clGetPlatformIDs().
+ */
+ static cl_int get(
+ vector<Platform>* platforms)
+ {
+ cl_uint n = 0;
+
+ if( platforms == NULL ) {
+ return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+ }
+
+ cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ vector<cl_platform_id> ids(n);
+ err = ::clGetPlatformIDs(n, ids.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+ }
+
+ if (platforms) {
+ platforms->resize(ids.size());
+
+ // Platforms don't reference count
+ for (size_type i = 0; i < ids.size(); i++) {
+ (*platforms)[i] = Platform(ids[i]);
+ }
+ }
+ return CL_SUCCESS;
+ }
+
+ /*! \brief Gets the first available platform.
+ *
+ * Wraps clGetPlatformIDs(), returning the first result.
+ */
+ static cl_int get(
+ Platform * platform)
+ {
+ cl_int err;
+ Platform default_platform = Platform::getDefault(&err);
+ if (platform) {
+ *platform = default_platform;
+ }
+ return err;
+ }
+
+ /*! \brief Gets the first available platform, returning it by value.
+ *
+     * \return Returns a valid platform if one is available.
+     * If no platform is available, returns a null platform.
+     * With exceptions enabled, a cl::Error exception is thrown instead
+     * if no platform is available or an error condition occurs.
+ * Wraps clGetPlatformIDs(), returning the first result.
+ */
+ static Platform get(
+ cl_int * errResult = NULL)
+ {
+ cl_int err;
+ Platform default_platform = Platform::getDefault(&err);
+ if (errResult) {
+ *errResult = err;
+ }
+ return default_platform;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+    //! \brief Wrapper for clUnloadPlatformCompiler().
+ cl_int
+ unloadCompiler()
+ {
+ return ::clUnloadPlatformCompiler(object_);
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+}; // class Platform
+
+CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Platform::default_initialized_;
+CL_HPP_DEFINE_STATIC_MEMBER_ Platform Platform::default_;
+CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS;
+
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+/**
+ * Unload the OpenCL compiler.
+ * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline cl_int
+UnloadCompiler()
+{
+ return ::clUnloadCompiler();
+}
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+/*! \brief Class interface for cl_context.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_context as the original. For details, see
+ * clRetainContext() and clReleaseContext().
+ *
+ * \see cl_context
+ */
+class Context
+ : public detail::Wrapper<cl_context>
+{
+private:
+ static std::once_flag default_initialized_;
+ static Context default_;
+ static cl_int default_error_;
+
+ /*! \brief Create the default context from the default device type in the default platform.
+ *
+ * This sets @c default_ and @c default_error_. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefault() {
+ /* Throwing an exception from a call_once invocation does not do
+ * what we wish, so we catch it and save the error.
+ */
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ try
+#endif
+ {
+#if !defined(__APPLE__) && !defined(__MACOS)
+ const Platform &p = Platform::getDefault();
+ cl_platform_id defaultPlatform = p();
+ cl_context_properties properties[3] = {
+ CL_CONTEXT_PLATFORM, (cl_context_properties)defaultPlatform, 0
+ };
+#else // #if !defined(__APPLE__) && !defined(__MACOS)
+ cl_context_properties *properties = nullptr;
+#endif // #if !defined(__APPLE__) && !defined(__MACOS)
+
+ default_ = Context(
+ CL_DEVICE_TYPE_DEFAULT,
+ properties,
+ NULL,
+ NULL,
+ &default_error_);
+ }
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ catch (cl::Error &e) {
+ default_error_ = e.err();
+ }
+#endif
+ }
+
+
+ /*! \brief Create the default context from a provided Context.
+ *
+ * This sets @c default_. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefaultProvided(const Context &c) {
+ default_ = c;
+ }
+
+public:
+#ifdef CL_HPP_UNIT_TEST_ENABLE
+ /*! \brief Reset the default.
+ *
+ * This sets @c default_ to an empty value to support cleanup in
+ * the unit test framework.
+ * This function is not thread safe.
+ */
+ static void unitTestClearDefault() {
+ default_ = Context();
+ }
+#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE
+
+ /*! \brief Constructs a context including a list of specified devices.
+ *
+ * Wraps clCreateContext().
+ */
+ Context(
+ const vector<Device>& devices,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ size_type,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ size_type numDevices = devices.size();
+ vector<cl_device_id> deviceIDs(numDevices);
+
+ for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ object_ = ::clCreateContext(
+ properties, (cl_uint) numDevices,
+ deviceIDs.data(),
+ notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+    /*! \brief Constructs a context including a single specified device.
+     *
+     * Wraps clCreateContext().
+     */
+    Context(
+ const Device& device,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ size_type,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ cl_device_id deviceID = device();
+
+ object_ = ::clCreateContext(
+ properties, 1,
+ &deviceID,
+ notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructs a context including all or a subset of devices of a specified type.
+ *
+ * Wraps clCreateContextFromType().
+ */
+ Context(
+ cl_device_type type,
+ cl_context_properties* properties = NULL,
+ void (CL_CALLBACK * notifyFptr)(
+ const char *,
+ const void *,
+ size_type,
+ void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+#if !defined(__APPLE__) && !defined(__MACOS)
+ cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
+
+ if (properties == NULL) {
+ // Get a valid platform ID as we cannot send in a blank one
+ vector<Platform> platforms;
+ error = Platform::get(&platforms);
+ if (error != CL_SUCCESS) {
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ // Check the platforms we found for a device of our specified type
+ cl_context_properties platform_id = 0;
+ for (unsigned int i = 0; i < platforms.size(); i++) {
+
+ vector<Device> devices;
+
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ try {
+#endif
+
+ error = platforms[i].getDevices(type, &devices);
+
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+            } catch (Error &) {}
+            // Swallow the exception: if this platform has no devices of the
+            // requested type we still want to try the remaining platforms.
+            // The error code is checked below and can be rethrown if needed.
+#endif
+
+ // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
+ if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ if (devices.size() > 0) {
+ platform_id = (cl_context_properties)platforms[i]();
+ break;
+ }
+ }
+
+ if (platform_id == 0) {
+ detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = CL_DEVICE_NOT_FOUND;
+ }
+ return;
+ }
+
+ prop[1] = platform_id;
+ properties = &prop[0];
+ }
+#endif
+ object_ = ::clCreateContextFromType(
+ properties, type, notifyFptr, data, &error);
+
+ detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Context(const Context& ctx) : detail::Wrapper<cl_type>(ctx) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Context& operator = (const Context &ctx)
+ {
+ detail::Wrapper<cl_type>::operator=(ctx);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Context(Context&& ctx) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(ctx)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Context& operator = (Context &&ctx)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(ctx));
+ return *this;
+ }
+
+
+ /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
+ *
+ * \note All calls to this function return the same cl_context as the first.
+ */
+ static Context getDefault(cl_int * err = NULL)
+ {
+ std::call_once(default_initialized_, makeDefault);
+ detail::errHandler(default_error_);
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+    /**
+     * Modify the default context to be used by
+     * subsequent operations.
+     * Only takes effect if no default context has been created yet.
+     * @return The updated default context.
+     * Compare it to the passed value to confirm that it was updated.
+     */
+ static Context setDefault(const Context &default_context)
+ {
+ std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_context));
+ detail::errHandler(default_error_);
+ return default_;
+ }
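+
+    /* Illustrative usage sketch: install a caller-created context as the
+     * process-wide default and verify that the update took effect
+     * (assumes a GPU device is available).
+     *
+     * \code
+     *     cl::Context mine(CL_DEVICE_TYPE_GPU);
+     *     cl::Context dflt = cl::Context::setDefault(mine);
+     *     if (dflt() != mine()) {
+     *         // A default context already existed; ours was not installed.
+     *     }
+     * \endcode
+     */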
+
+ //! \brief Default constructor - initializes to NULL.
+ Context() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Constructor from cl_context - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the cl_context
+ * into the new Context object.
+ */
+ explicit Context(const cl_context& context, bool retainObject = false) :
+ detail::Wrapper<cl_type>(context, retainObject) { }
+
+ /*! \brief Assignment operator from cl_context - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseContext() on the value previously held by this instance.
+ */
+ Context& operator = (const cl_context& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetContextInfo().
+ template <typename T>
+ cl_int getInfo(cl_context_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetContextInfo, object_, name, param),
+ __GET_CONTEXT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetContextInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_context_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_context_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Gets a list of supported image formats.
+ *
+ * Wraps clGetSupportedImageFormats().
+ */
+ cl_int getSupportedImageFormats(
+ cl_mem_flags flags,
+ cl_mem_object_type type,
+ vector<ImageFormat>* formats) const
+ {
+ cl_uint numEntries;
+
+ if (!formats) {
+ return CL_SUCCESS;
+ }
+
+ cl_int err = ::clGetSupportedImageFormats(
+ object_,
+ flags,
+ type,
+ 0,
+ NULL,
+ &numEntries);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+ }
+
+ if (numEntries > 0) {
+ vector<ImageFormat> value(numEntries);
+ err = ::clGetSupportedImageFormats(
+ object_,
+ flags,
+ type,
+ numEntries,
+ (cl_image_format*)value.data(),
+ NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+ }
+
+ formats->assign(begin(value), end(value));
+ }
+ else {
+ // If no values are being returned, ensure an empty vector comes back
+ formats->clear();
+ }
+
+ return CL_SUCCESS;
+ }
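+
+    /* Illustrative usage sketch: list the formats supported for
+     * read-only 2D images, where 'ctx' is assumed to be a valid
+     * cl::Context.
+     *
+     * \code
+     *     std::vector<cl::ImageFormat> formats;
+     *     cl_int err = ctx.getSupportedImageFormats(
+     *         CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &formats);
+     * \endcode
+     */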
+};
+
+inline void Device::makeDefault()
+{
+ /* Throwing an exception from a call_once invocation does not do
+ * what we wish, so we catch it and save the error.
+ */
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ try
+#endif
+ {
+ cl_int error = 0;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+ if (error != CL_SUCCESS) {
+ default_error_ = error;
+ }
+ else {
+ default_ = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+ default_error_ = CL_SUCCESS;
+ }
+ }
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ catch (cl::Error &e) {
+ default_error_ = e.err();
+ }
+#endif
+}
+
+CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Context::default_initialized_;
+CL_HPP_DEFINE_STATIC_MEMBER_ Context Context::default_;
+CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Context::default_error_ = CL_SUCCESS;
+
+/*! \brief Class interface for cl_event.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_event as the original. For details, see
+ * clRetainEvent() and clReleaseEvent().
+ *
+ * \see cl_event
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+ //! \brief Default constructor - initializes to NULL.
+ Event() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Constructor from cl_event - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * This effectively transfers ownership of a refcount on the cl_event
+ * into the new Event object.
+ */
+ explicit Event(const cl_event& event, bool retainObject = false) :
+ detail::Wrapper<cl_type>(event, retainObject) { }
+
+ /*! \brief Assignment operator from cl_event - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseEvent() on the value previously held by this instance.
+ */
+ Event& operator = (const cl_event& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetEventInfo().
+ template <typename T>
+ cl_int getInfo(cl_event_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetEventInfo, object_, name, param),
+ __GET_EVENT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetEventInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_event_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_event_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ //! \brief Wrapper for clGetEventProfilingInfo().
+ template <typename T>
+ cl_int getProfilingInfo(cl_profiling_info name, T* param) const
+ {
+ return detail::errHandler(detail::getInfo(
+ &::clGetEventProfilingInfo, object_, name, param),
+ __GET_EVENT_PROFILE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_profiling_info, name>::param_type
+ getProfilingInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_profiling_info, name>::param_type param;
+ cl_int result = getProfilingInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*! \brief Blocks the calling thread until this event completes.
+ *
+ * Wraps clWaitForEvents().
+ */
+ cl_int wait() const
+ {
+ return detail::errHandler(
+ ::clWaitForEvents(1, &object_),
+ __WAIT_FOR_EVENTS_ERR);
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+ /*! \brief Registers a user callback function for a specific command execution status.
+ *
+ * Wraps clSetEventCallback().
+ */
+ cl_int setCallback(
+ cl_int type,
+ void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+ void * user_data = NULL)
+ {
+ return detail::errHandler(
+ ::clSetEventCallback(
+ object_,
+ type,
+ pfn_notify,
+ user_data),
+ __SET_EVENT_CALLBACK_ERR);
+ }
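+
+    /* Illustrative usage sketch: get notified when an enqueued command
+     * finishes; 'ev' is assumed to be an event returned by an enqueue
+     * call, and onComplete a user-provided callback.
+     *
+     * \code
+     *     extern void CL_CALLBACK onComplete(cl_event, cl_int, void *);
+     *     ev.setCallback(CL_COMPLETE, &onComplete);
+     * \endcode
+     */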
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+
+ /*! \brief Blocks the calling thread until every event specified is complete.
+ *
+ * Wraps clWaitForEvents().
+ */
+ static cl_int
+ waitForEvents(const vector<Event>& events)
+ {
+ return detail::errHandler(
+ ::clWaitForEvents(
+ (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL),
+ __WAIT_FOR_EVENTS_ERR);
+ }
+};
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+/*! \brief Class interface for user events (a subset of cl_event objects).
+ *
+ * See Event for details about copy semantics, etc.
+ */
+class UserEvent : public Event
+{
+public:
+ /*! \brief Constructs a user event on a given context.
+ *
+ * Wraps clCreateUserEvent().
+ */
+ UserEvent(
+ const Context& context,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateUserEvent(
+ context(),
+ &error);
+
+ detail::errHandler(error, __CREATE_USER_EVENT_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ UserEvent() : Event() { }
+
+ /*! \brief Sets the execution status of a user event object.
+ *
+ * Wraps clSetUserEventStatus().
+ */
+ cl_int setStatus(cl_int status)
+ {
+ return detail::errHandler(
+ ::clSetUserEventStatus(object_,status),
+ __SET_USER_EVENT_STATUS_ERR);
+ }
+};
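+
+/* Illustrative usage sketch: a user event can gate enqueued commands;
+ * marking it complete releases every command waiting on it ('context'
+ * is assumed to be a valid cl::Context).
+ *
+ * \code
+ *     cl::UserEvent gate(context);
+ *     // ... enqueue commands whose wait lists contain 'gate' ...
+ *     gate.setStatus(CL_COMPLETE);
+ * \endcode
+ */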
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+
+/*! \brief Blocks the calling thread until every event specified is complete.
+ *
+ * Wraps clWaitForEvents().
+ */
+inline static cl_int
+WaitForEvents(const vector<Event>& events)
+{
+ return detail::errHandler(
+ ::clWaitForEvents(
+ (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL),
+ __WAIT_FOR_EVENTS_ERR);
+}
+
+/*! \brief Class interface for cl_mem.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_mem as the original. For details, see
+ * clRetainMemObject() and clReleaseMemObject().
+ *
+ * \see cl_mem
+ */
+class Memory : public detail::Wrapper<cl_mem>
+{
+public:
+ //! \brief Default constructor - initializes to NULL.
+ Memory() : detail::Wrapper<cl_type>() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * Optionally transfer ownership of a refcount on the cl_mem
+ * into the new Memory object.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ *
+ * See Memory for further details.
+ */
+ explicit Memory(const cl_mem& memory, bool retainObject) :
+ detail::Wrapper<cl_type>(memory, retainObject) { }
+
+ /*! \brief Assignment operator from cl_mem - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseMemObject() on the value previously held by this instance.
+ */
+ Memory& operator = (const cl_mem& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Memory(const Memory& mem) : detail::Wrapper<cl_type>(mem) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Memory& operator = (const Memory &mem)
+ {
+ detail::Wrapper<cl_type>::operator=(mem);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Memory(Memory&& mem) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(mem)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Memory& operator = (Memory &&mem)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(mem));
+ return *this;
+ }
+
+
+ //! \brief Wrapper for clGetMemObjectInfo().
+ template <typename T>
+ cl_int getInfo(cl_mem_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
+ __GET_MEM_OBJECT_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_mem_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_mem_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
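+
+    /* Illustrative usage sketch: query the size in bytes of a memory
+     * object, where 'mem' is assumed to be a valid cl::Memory.
+     *
+     * \code
+     *     cl_int err = CL_SUCCESS;
+     *     size_type bytes = mem.getInfo<CL_MEM_SIZE>(&err);
+     * \endcode
+     */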
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+ /*! \brief Registers a callback function to be called when the memory object
+ * is no longer needed.
+ *
+ * Wraps clSetMemObjectDestructorCallback().
+ *
+ * Repeated calls to this function, for a given cl_mem value, will append
+     * to the list of functions called (in reverse order) when the memory
+     * object's resources are freed and the memory object is deleted.
+ *
+ * \note
+ * The registered callbacks are associated with the underlying cl_mem
+ * value - not the Memory class instance.
+ */
+ cl_int setDestructorCallback(
+ void (CL_CALLBACK * pfn_notify)(cl_mem, void *),
+ void * user_data = NULL)
+ {
+ return detail::errHandler(
+ ::clSetMemObjectDestructorCallback(
+ object_,
+ pfn_notify,
+ user_data),
+ __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
+ }
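+
+    /* Illustrative usage sketch: log when the underlying cl_mem is
+     * destroyed; onMemFreed is a user-provided callback and must not
+     * call blocking OpenCL APIs.
+     *
+     * \code
+     *     extern void CL_CALLBACK onMemFreed(cl_mem, void *);
+     *     mem.setDestructorCallback(&onMemFreed, nullptr);
+     * \endcode
+     */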
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+
+};
+
+// Pre-declare copy functions
+class Buffer;
+template< typename IteratorType >
+cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+namespace detail
+{
+ class SVMTraitNull
+ {
+ public:
+ static cl_svm_mem_flags getSVMMemFlags()
+ {
+ return 0;
+ }
+ };
+} // namespace detail
+
+template<class Trait = detail::SVMTraitNull>
+class SVMTraitReadWrite
+{
+public:
+ static cl_svm_mem_flags getSVMMemFlags()
+ {
+ return CL_MEM_READ_WRITE |
+ Trait::getSVMMemFlags();
+ }
+};
+
+template<class Trait = detail::SVMTraitNull>
+class SVMTraitReadOnly
+{
+public:
+ static cl_svm_mem_flags getSVMMemFlags()
+ {
+ return CL_MEM_READ_ONLY |
+ Trait::getSVMMemFlags();
+ }
+};
+
+template<class Trait = detail::SVMTraitNull>
+class SVMTraitWriteOnly
+{
+public:
+ static cl_svm_mem_flags getSVMMemFlags()
+ {
+ return CL_MEM_WRITE_ONLY |
+ Trait::getSVMMemFlags();
+ }
+};
+
+template<class Trait = SVMTraitReadWrite<>>
+class SVMTraitCoarse
+{
+public:
+ static cl_svm_mem_flags getSVMMemFlags()
+ {
+ return Trait::getSVMMemFlags();
+ }
+};
+
+template<class Trait = SVMTraitReadWrite<>>
+class SVMTraitFine
+{
+public:
+ static cl_svm_mem_flags getSVMMemFlags()
+ {
+ return CL_MEM_SVM_FINE_GRAIN_BUFFER |
+ Trait::getSVMMemFlags();
+ }
+};
+
+template<class Trait = SVMTraitReadWrite<>>
+class SVMTraitAtomic
+{
+public:
+ static cl_svm_mem_flags getSVMMemFlags()
+ {
+ return
+ CL_MEM_SVM_FINE_GRAIN_BUFFER |
+ CL_MEM_SVM_ATOMICS |
+ Trait::getSVMMemFlags();
+ }
+};
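+
+/* Illustrative sketch: the traits compose by OR-ing their flags, so a
+ * fine-grained read-only allocation can be described by nesting them.
+ *
+ * \code
+ *     cl_svm_mem_flags f =
+ *         cl::SVMTraitFine<cl::SVMTraitReadOnly<>>::getSVMMemFlags();
+ *     // f == (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_READ_ONLY)
+ * \endcode
+ */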
+
+// Pre-declare SVM map function
+template<typename T>
+inline cl_int enqueueMapSVM(
+ T* ptr,
+ cl_bool blocking,
+ cl_map_flags flags,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL);
+
+/**
+ * STL-like allocator class for managing SVM objects provided for convenience.
+ *
+ * Note that while this behaves like an allocator for the purposes of constructing vectors and similar objects,
+ * care must be taken when using it with smart pointers.
+ * The allocator should not be used to construct a unique_ptr directly when coarse-grained SVM mode is in use,
+ * because coarse-grained management would behave incorrectly with respect to reference counting.
+ *
+ * Instead, the allocator embeds a Deleter which may be used with unique_ptr and which is used
+ * by the allocate_pointer and allocate_svm operations supplied below.
+ */
+template<typename T, class SVMTrait>
+class SVMAllocator {
+private:
+ Context context_;
+
+public:
+ typedef T value_type;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ template<typename U>
+ struct rebind
+ {
+ typedef SVMAllocator<U, SVMTrait> other;
+ };
+
+ template<typename U, typename V>
+ friend class SVMAllocator;
+
+ SVMAllocator() :
+ context_(Context::getDefault())
+ {
+ }
+
+ explicit SVMAllocator(cl::Context context) :
+ context_(context)
+ {
+ }
+
+
+ SVMAllocator(const SVMAllocator &other) :
+ context_(other.context_)
+ {
+ }
+
+ template<typename U>
+ SVMAllocator(const SVMAllocator<U, SVMTrait> &other) :
+ context_(other.context_)
+ {
+ }
+
+ ~SVMAllocator()
+ {
+ }
+
+ pointer address(reference r) CL_HPP_NOEXCEPT_
+ {
+ return std::addressof(r);
+ }
+
+ const_pointer address(const_reference r) CL_HPP_NOEXCEPT_
+ {
+ return std::addressof(r);
+ }
+
+    /**
+     * Allocate an SVM pointer.
+     *
+     * If the allocator is coarse-grained, the allocation is also mapped for
+     * host access so that containers can correctly construct data in place.
+     */
+ pointer allocate(
+ size_type size,
+ typename cl::SVMAllocator<void, SVMTrait>::const_pointer = 0)
+ {
+ // Allocate memory with default alignment matching the size of the type
+ void* voidPointer =
+ clSVMAlloc(
+ context_(),
+ SVMTrait::getSVMMemFlags(),
+ size*sizeof(T),
+ sizeof(T));
+ pointer retValue = reinterpret_cast<pointer>(
+ voidPointer);
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ if (!retValue) {
+ std::bad_alloc excep;
+ throw excep;
+ }
+#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
+
+        // If the allocation was coarse-grained then map it for host access
+        if (!(SVMTrait::getSVMMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) {
+            cl_int err = enqueueMapSVM(retValue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, size*sizeof(T));
+            if (err != CL_SUCCESS) {
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+                std::bad_alloc excep;
+                throw excep;
+#else
+                clSVMFree(context_(), retValue);
+                retValue = nullptr;
+#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
+            }
+        }
+
+        // If exceptions are disabled, failure is reported by returning a null pointer
+        return retValue;
+ }
+
+ void deallocate(pointer p, size_type)
+ {
+ clSVMFree(context_(), p);
+ }
+
+ /**
+ * Return the maximum possible allocation size.
+ * This is the minimum of the maximum sizes of all devices in the context.
+ */
+ size_type max_size() const CL_HPP_NOEXCEPT_
+ {
+ size_type maxSize = std::numeric_limits<size_type>::max() / sizeof(T);
+
+        for (const Device &d : context_.getInfo<CL_CONTEXT_DEVICES>()) {
+ maxSize = std::min(
+ maxSize,
+ static_cast<size_type>(d.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()));
+ }
+
+ return maxSize;
+ }
+
+    template< class U, class... Args >
+    void construct(U* p, Args&&... args)
+    {
+        new (p) U(std::forward<Args>(args)...);
+    }
+
+ template< class U >
+ void destroy(U* p)
+ {
+ p->~U();
+ }
+
+ /**
+ * Returns true if the contexts match.
+ */
+ inline bool operator==(SVMAllocator const& rhs)
+ {
+ return (context_==rhs.context_);
+ }
+
+ inline bool operator!=(SVMAllocator const& a)
+ {
+ return !operator==(a);
+ }
+}; // class SVMAllocator
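+
+/* Illustrative usage sketch: a std::vector backed by coarse-grained SVM
+ * storage, assuming the default context's device supports SVM
+ * (OpenCL 2.0 or later).
+ *
+ * \code
+ *     cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;
+ *     std::vector<int, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>> v(svmAlloc);
+ *     v.resize(1024);
+ * \endcode
+ */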
+
+
+template<class SVMTrait>
+class SVMAllocator<void, SVMTrait> {
+public:
+ typedef void value_type;
+ typedef value_type* pointer;
+ typedef const value_type* const_pointer;
+
+ template<typename U>
+ struct rebind
+ {
+ typedef SVMAllocator<U, SVMTrait> other;
+ };
+
+ template<typename U, typename V>
+ friend class SVMAllocator;
+};
+
+#if !defined(CL_HPP_NO_STD_UNIQUE_PTR)
+namespace detail
+{
+ template<class Alloc>
+ class Deleter {
+ private:
+ Alloc alloc_;
+ size_type copies_;
+
+ public:
+ typedef typename std::allocator_traits<Alloc>::pointer pointer;
+
+ Deleter(const Alloc &alloc, size_type copies) : alloc_{ alloc }, copies_{ copies }
+ {
+ }
+
+ void operator()(pointer ptr) const {
+ Alloc tmpAlloc{ alloc_ };
+ std::allocator_traits<Alloc>::destroy(tmpAlloc, std::addressof(*ptr));
+ std::allocator_traits<Alloc>::deallocate(tmpAlloc, ptr, copies_);
+ }
+ };
+} // namespace detail
+
+/**
+ * Allocation operation in the style of std::allocate_shared, but returning
+ * a cl::pointer<T> (a std::unique_ptr with an allocator-aware Deleter).
+ * A unique_ptr is used so that no shared-state control block needs to be
+ * allocated in memory that might be inaccessible to the host.
+ */
+template <class T, class Alloc, class... Args>
+cl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Args&&... args)
+{
+ Alloc alloc(alloc_);
+ static const size_t copies = 1;
+
+    // Allocate and construct the object in separate steps so that a failed
+    // construction can deallocate cleanly and only a deleter is needed
+
+ T* tmp = std::allocator_traits<Alloc>::allocate(alloc, copies);
+ if (!tmp) {
+ std::bad_alloc excep;
+ throw excep;
+ }
+ try {
+ std::allocator_traits<Alloc>::construct(
+ alloc,
+ std::addressof(*tmp),
+ std::forward<Args>(args)...);
+
+ return cl::pointer<T, detail::Deleter<Alloc>>(tmp, detail::Deleter<Alloc>{alloc, copies});
+ }
+    catch (std::bad_alloc &)
+ {
+ std::allocator_traits<Alloc>::deallocate(alloc, tmp, copies);
+ throw;
+ }
+}
+
+template< class T, class SVMTrait, class... Args >
+cl::pointer<T, detail::Deleter<SVMAllocator<T, SVMTrait>>> allocate_svm(Args... args)
+{
+ SVMAllocator<T, SVMTrait> alloc;
+ return cl::allocate_pointer<T>(alloc, args...);
+}
+
+template< class T, class SVMTrait, class... Args >
+cl::pointer<T, detail::Deleter<SVMAllocator<T, SVMTrait>>> allocate_svm(const cl::Context &c, Args... args)
+{
+ SVMAllocator<T, SVMTrait> alloc(c);
+ return cl::allocate_pointer<T>(alloc, args...);
+}
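+
+/* Illustrative usage sketch: create a uniquely owned SVM object; when
+ * the pointer goes out of scope the embedded Deleter destroys the
+ * object and releases the SVM allocation (assumes SVM support).
+ *
+ * \code
+ *     auto p = cl::allocate_svm<int, cl::SVMTraitCoarse<>>(42);
+ *     *p = 43;    // coarse allocations are mapped on creation
+ * \endcode
+ */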
+#endif // #if !defined(CL_HPP_NO_STD_UNIQUE_PTR)
+
+/*! \brief Vector alias to simplify construction of coarse-grained SVM containers.
+ *
+ */
+template < class T >
+using coarse_svm_vector = vector<T, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>>;
+
+/*! \brief Vector alias to simplify construction of fine-grained SVM containers.
+*
+*/
+template < class T >
+using fine_svm_vector = vector<T, cl::SVMAllocator<int, cl::SVMTraitFine<>>>;
+
+/*! \brief Vector alias to simplify construction of fine-grained SVM containers that support platform atomics.
+*
+*/
+template < class T >
+using atomic_svm_vector = vector<T, cl::SVMAllocator<int, cl::SVMTraitAtomic<>>>;
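+
+/* Illustrative usage sketch: the aliases make SVM-backed containers read
+ * like ordinary vectors (assumes the default context's device supports SVM).
+ *
+ * \code
+ *     cl::coarse_svm_vector<float> v(256, 0.0f);
+ * \endcode
+ */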
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+
+/*! \brief Class interface for Buffer Memory Objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Buffer : public Memory
+{
+public:
+
+ /*! \brief Constructs a Buffer in a specified context.
+ *
+ * Wraps clCreateBuffer().
+ *
+     * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag is
+     *                  specified. Note the alignment and exclusivity requirements.
+ */
+ Buffer(
+ const Context& context,
+ cl_mem_flags flags,
+ size_type size,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructs a Buffer in the default context.
+ *
+ * Wraps clCreateBuffer().
+ *
+     * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag is
+     *                  specified. Note the alignment and exclusivity requirements.
+ *
+ * \see Context::getDefault()
+ */
+ Buffer(
+ cl_mem_flags flags,
+ size_type size,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(err);
+
+ object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*!
+ * \brief Construct a Buffer from a host container via iterators.
+ * IteratorType must be random access.
+     * If useHostPtr is specified, the iterators must represent contiguous data.
+ */
+ template< typename IteratorType >
+ Buffer(
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr = false,
+ cl_int* err = NULL)
+ {
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if( readOnly ) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if( useHostPtr ) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ size_type size = sizeof(DataType)*(endIterator - startIterator);
+
+ Context context = Context::getDefault(err);
+
+ if( useHostPtr ) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ } else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if( !useHostPtr ) {
+ error = cl::copy(startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ }
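+
+    /* Illustrative usage sketch: upload a host std::vector into a new
+     * read-only device buffer in the default context; the data is
+     * copied via cl::copy.
+     *
+     * \code
+     *     std::vector<float> host(1024, 1.0f);
+     *     cl_int err = CL_SUCCESS;
+     *     // readOnly = true, useHostPtr = false
+     *     cl::Buffer buf(host.begin(), host.end(), true, false, &err);
+     * \endcode
+     */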
+
+ /*!
+ * \brief Construct a Buffer from a host container via iterators using a specified context.
+ * IteratorType must be random access.
+     * If useHostPtr is specified, the iterators must represent contiguous data.
+ */
+ template< typename IteratorType >
+ Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
+ bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+
+ /*!
+ * \brief Construct a Buffer from a host container via iterators using a specified queue.
+     * IteratorType must be random access.
+     * If useHostPtr is specified, the iterators must represent contiguous data.
+ */
+ template< typename IteratorType >
+ Buffer(const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator,
+ bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+
+ //! \brief Default constructor - initializes to NULL.
+ Buffer() : Memory() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with earlier versions.
+ *
+ * See Memory for further details.
+ */
+ explicit Buffer(const cl_mem& buffer, bool retainObject = false) :
+ Memory(buffer, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Buffer& operator = (const cl_mem& rhs)
+ {
+ Memory::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Buffer(const Buffer& buf) : Memory(buf) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Buffer& operator = (const Buffer &buf)
+ {
+ Memory::operator=(buf);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Buffer(Buffer&& buf) CL_HPP_NOEXCEPT_ : Memory(std::move(buf)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Buffer& operator = (Buffer &&buf)
+ {
+ Memory::operator=(std::move(buf));
+ return *this;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+ /*! \brief Creates a new buffer object from this.
+ *
+ * Wraps clCreateSubBuffer().
+ */
+ Buffer createSubBuffer(
+ cl_mem_flags flags,
+ cl_buffer_create_type buffer_create_type,
+ const void * buffer_create_info,
+ cl_int * err = NULL)
+ {
+ Buffer result;
+ cl_int error;
+ result.object_ = ::clCreateSubBuffer(
+ object_,
+ flags,
+ buffer_create_type,
+ buffer_create_info,
+ &error);
+
+ detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ return result;
+ }
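+
+    /* Illustrative usage sketch: carve a 256-byte sub-buffer out of an
+     * existing buffer 'buf'; the origin must meet the device's
+     * CL_DEVICE_MEM_BASE_ADDR_ALIGN requirement.
+     *
+     * \code
+     *     cl_buffer_region region = {0, 256};    // origin, size
+     *     cl_int err = CL_SUCCESS;
+     *     cl::Buffer sub = buf.createSubBuffer(
+     *         CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+     * \endcode
+     */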
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+};
+
+#if defined (CL_HPP_USE_DX_INTEROP)
+/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
+ *
+ * This is provided to facilitate interoperability with Direct3D.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferD3D10 : public Buffer
+{
+public:
+
+
+ /*! \brief Constructs a BufferD3D10, in a specified context, from a
+ * given ID3D10Buffer.
+ *
+ * Wraps clCreateFromD3D10BufferKHR().
+ */
+ BufferD3D10(
+ const Context& context,
+ cl_mem_flags flags,
+ ID3D10Buffer* bufobj,
+        cl_int * err = NULL)
+    {
+        typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
+            cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer,
+            cl_int* errcode_ret);
+        PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = nullptr;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+        vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
+        cl_platform_id platform = nullptr;
+        for( size_type i = 0; i < props.size(); ++i ) {
+            if( props[i] == CL_CONTEXT_PLATFORM ) {
+                platform = (cl_platform_id)props[i+1];
+ }
+ }
+ CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateFromD3D10BufferKHR);
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 110
+ CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateFromD3D10BufferKHR);
+#endif
+
+ cl_int error;
+ object_ = pfn_clCreateFromD3D10BufferKHR(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferD3D10() : Buffer() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit BufferD3D10(const cl_mem& buffer, bool retainObject = false) :
+ Buffer(buffer, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferD3D10& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferD3D10(const BufferD3D10& buf) :
+ Buffer(buf) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferD3D10& operator = (const BufferD3D10 &buf)
+ {
+ Buffer::operator=(buf);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferD3D10(BufferD3D10&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferD3D10& operator = (BufferD3D10 &&buf)
+ {
+ Buffer::operator=(std::move(buf));
+ return *this;
+ }
+};
+#endif
+
+/*! \brief Class interface for GL Buffer Memory Objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferGL : public Buffer
+{
+public:
+ /*! \brief Constructs a BufferGL in a specified context, from a given
+ * GL buffer.
+ *
+ * Wraps clCreateFromGLBuffer().
+ */
+ BufferGL(
+ const Context& context,
+ cl_mem_flags flags,
+ cl_GLuint bufobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLBuffer(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferGL() : Buffer() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit BufferGL(const cl_mem& buffer, bool retainObject = false) :
+ Buffer(buffer, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferGL& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferGL(const BufferGL& buf) : Buffer(buf) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferGL& operator = (const BufferGL &buf)
+ {
+ Buffer::operator=(buf);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferGL(BufferGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferGL& operator = (BufferGL &&buf)
+ {
+ Buffer::operator=(std::move(buf));
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetGLObjectInfo().
+ cl_int getObjectInfo(
+ cl_gl_object_type *type,
+ cl_GLuint * gl_object_name)
+ {
+ return detail::errHandler(
+ ::clGetGLObjectInfo(object_,type,gl_object_name),
+ __GET_GL_OBJECT_INFO_ERR);
+ }
+};
+
+/*! \brief Class interface for GL Render Buffer Memory Objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class BufferRenderGL : public Buffer
+{
+public:
+ /*! \brief Constructs a BufferRenderGL in a specified context, from a given
+ * GL Renderbuffer.
+ *
+ * Wraps clCreateFromGLRenderbuffer().
+ */
+ BufferRenderGL(
+ const Context& context,
+ cl_mem_flags flags,
+ cl_GLuint bufobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLRenderbuffer(
+ context(),
+ flags,
+ bufobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ BufferRenderGL() : Buffer() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit BufferRenderGL(const cl_mem& buffer, bool retainObject = false) :
+ Buffer(buffer, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ BufferRenderGL& operator = (const cl_mem& rhs)
+ {
+ Buffer::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferRenderGL(const BufferRenderGL& buf) : Buffer(buf) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferRenderGL& operator = (const BufferRenderGL &buf)
+ {
+ Buffer::operator=(buf);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferRenderGL(BufferRenderGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ BufferRenderGL& operator = (BufferRenderGL &&buf)
+ {
+ Buffer::operator=(std::move(buf));
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetGLObjectInfo().
+ cl_int getObjectInfo(
+ cl_gl_object_type *type,
+ cl_GLuint * gl_object_name)
+ {
+ return detail::errHandler(
+ ::clGetGLObjectInfo(object_,type,gl_object_name),
+ __GET_GL_OBJECT_INFO_ERR);
+ }
+};
+
+/*! \brief C++ base class for Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image : public Memory
+{
+protected:
+ //! \brief Default constructor - initializes to NULL.
+ Image() : Memory() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image(const cl_mem& image, bool retainObject = false) :
+ Memory(image, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image& operator = (const cl_mem& rhs)
+ {
+ Memory::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image(const Image& img) : Memory(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image& operator = (const Image &img)
+ {
+ Memory::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image(Image&& img) CL_HPP_NOEXCEPT_ : Memory(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image& operator = (Image &&img)
+ {
+ Memory::operator=(std::move(img));
+ return *this;
+ }
+
+
+public:
+ //! \brief Wrapper for clGetImageInfo().
+ template <typename T>
+ cl_int getImageInfo(cl_image_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetImageInfo, object_, name, param),
+ __GET_IMAGE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetImageInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_image_info, name>::param_type
+ getImageInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_image_info, name>::param_type param;
+ cl_int result = getImageInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+};
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+/*! \brief Class interface for 1D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image1D : public Image
+{
+public:
+ /*! \brief Constructs a 1D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image1D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ size_type width,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D,
+ width,
+ 0, 0, 0, 0, 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image1D() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image1D(const cl_mem& image1D, bool retainObject = false) :
+ Image(image1D, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image1D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1D(const Image1D& img) : Image(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1D& operator = (const Image1D &img)
+ {
+ Image::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1D(Image1D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1D& operator = (Image1D &&img)
+ {
+ Image::operator=(std::move(img));
+ return *this;
+ }
+
+};
+
+/*! \class Image1DBuffer
+ * \brief Image interface for 1D buffer images.
+ */
+class Image1DBuffer : public Image
+{
+public:
+ Image1DBuffer(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ size_type width,
+ const Buffer &buffer,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D_BUFFER,
+ width,
+ 0, 0, 0, 0, 0, 0, 0,
+ buffer()
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ NULL,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image1DBuffer() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image1DBuffer(const cl_mem& image1D, bool retainObject = false) :
+ Image(image1D, retainObject) { }
+
+ Image1DBuffer& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DBuffer(const Image1DBuffer& img) : Image(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DBuffer& operator = (const Image1DBuffer &img)
+ {
+ Image::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DBuffer(Image1DBuffer&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DBuffer& operator = (Image1DBuffer &&img)
+ {
+ Image::operator=(std::move(img));
+ return *this;
+ }
+
+};
+
+/*! \class Image1DArray
+ * \brief Image interface for arrays of 1D images.
+ */
+class Image1DArray : public Image
+{
+public:
+ Image1DArray(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ size_type arraySize,
+ size_type width,
+ size_type rowPitch,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE1D_ARRAY,
+ width,
+ 0, 0, // height, depth (unused)
+ arraySize,
+ rowPitch,
+ 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image1DArray() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image1DArray(const cl_mem& imageArray, bool retainObject = false) :
+ Image(imageArray, retainObject) { }
+
+
+ Image1DArray& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DArray(const Image1DArray& img) : Image(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DArray& operator = (const Image1DArray &img)
+ {
+ Image::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DArray(Image1DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image1DArray& operator = (Image1DArray &&img)
+ {
+ Image::operator=(std::move(img));
+ return *this;
+ }
+
+};
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+
+/*! \brief Class interface for 2D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image2D : public Image
+{
+public:
+ /*! \brief Constructs a 2D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image2D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ size_type width,
+ size_type height,
+ size_type row_pitch = 0,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ bool useCreateImage;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+ }
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 120
+ useCreateImage = true;
+#else
+ useCreateImage = false;
+#endif
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ if (useCreateImage)
+ {
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D,
+ width,
+ height,
+ 0, 0, // depth, array size (unused)
+ row_pitch,
+ 0, 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ if (!useCreateImage)
+ {
+ object_ = ::clCreateImage2D(
+            context(), flags, &format, width, height, row_pitch, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_IMAGE2D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ }
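+
+    /* Illustrative usage sketch: create a 640x480 single-channel float
+     * image in an existing context 'ctx'.
+     *
+     * \code
+     *     cl_int err = CL_SUCCESS;
+     *     cl::Image2D img(ctx, CL_MEM_READ_WRITE,
+     *                     cl::ImageFormat(CL_R, CL_FLOAT),
+     *                     640, 480, 0, nullptr, &err);
+     * \endcode
+     */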
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ /*! \brief Constructs a 2D Image from a buffer.
+ * \note This will share storage with the underlying buffer.
+ *
+ * Wraps clCreateImage().
+ */
+ Image2D(
+ const Context& context,
+ ImageFormat format,
+ const Buffer &sourceBuffer,
+ size_type width,
+ size_type height,
+ size_type row_pitch = 0,
+ cl_int* err = nullptr)
+ {
+ cl_int error;
+
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D,
+ width,
+ height,
+ 0, 0, // depth, array size (unused)
+ row_pitch,
+ 0, 0, 0,
+ // Use buffer as input to image
+ sourceBuffer()
+ };
+ object_ = ::clCreateImage(
+ context(),
+ 0, // flags inherited from buffer
+ &format,
+ &desc,
+ nullptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != nullptr) {
+ *err = error;
+ }
+ }
+#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ /*! \brief Constructs a 2D Image from an image.
+ * \note This will share storage with the underlying image but may
+ * reinterpret the channel order and type.
+ *
+     * The image will be created with a descriptor matching the source.
+ *
+ * \param order is the channel order to reinterpret the image data as.
+ * The channel order may differ as described in the OpenCL
+ * 2.0 API specification.
+ *
+ * Wraps clCreateImage().
+ */
+ Image2D(
+ const Context& context,
+ cl_channel_order order,
+ const Image &sourceImage,
+ cl_int* err = nullptr)
+ {
+ cl_int error;
+
+ // Descriptor fields have to match source image
+ size_type sourceWidth =
+ sourceImage.getImageInfo<CL_IMAGE_WIDTH>();
+ size_type sourceHeight =
+ sourceImage.getImageInfo<CL_IMAGE_HEIGHT>();
+ size_type sourceRowPitch =
+ sourceImage.getImageInfo<CL_IMAGE_ROW_PITCH>();
+ cl_uint sourceNumMIPLevels =
+ sourceImage.getImageInfo<CL_IMAGE_NUM_MIP_LEVELS>();
+ cl_uint sourceNumSamples =
+ sourceImage.getImageInfo<CL_IMAGE_NUM_SAMPLES>();
+ cl_image_format sourceFormat =
+ sourceImage.getImageInfo<CL_IMAGE_FORMAT>();
+
+ // Update only the channel order.
+ // Channel format inherited from source.
+ sourceFormat.image_channel_order = order;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D,
+ sourceWidth,
+ sourceHeight,
+ 0, 0, // depth (unused), array size (unused)
+ sourceRowPitch,
+ 0, // slice pitch (unused)
+ sourceNumMIPLevels,
+ sourceNumSamples,
+            // Use the source image's mem object as input
+ sourceImage()
+ };
+ object_ = ::clCreateImage(
+ context(),
+ 0, // flags should be inherited from mem_object
+ &sourceFormat,
+ &desc,
+ nullptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != nullptr) {
+ *err = error;
+ }
+ }
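+
+    /* Illustrative usage sketch: view an existing RGBA image's storage
+     * with BGRA channel order, without copying ('ctx' and 'rgbaImage'
+     * are assumed valid).
+     *
+     * \code
+     *     cl_int err = CL_SUCCESS;
+     *     cl::Image2D bgraView(ctx, CL_BGRA, rgbaImage, &err);
+     * \endcode
+     */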
+#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+ //! \brief Default constructor - initializes to NULL.
+ Image2D() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image2D(const cl_mem& image2D, bool retainObject = false) :
+ Image(image2D, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image2D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2D(const Image2D& img) : Image(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2D& operator = (const Image2D &img)
+ {
+ Image::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2D(Image2D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2D& operator = (Image2D &&img)
+ {
+ Image::operator=(std::move(img));
+ return *this;
+ }
+
+};
+
+
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+/*! \brief Class interface for GL 2D Image Memory objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ * \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D
+{
+public:
+ /*! \brief Constructs an Image2DGL in a specified context, from a given
+ * GL Texture.
+ *
+ * Wraps clCreateFromGLTexture2D().
+ */
+ Image2DGL(
+ const Context& context,
+ cl_mem_flags flags,
+ cl_GLenum target,
+ cl_GLint miplevel,
+ cl_GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture2D(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image2DGL() : Image2D() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image2DGL(const cl_mem& image, bool retainObject = false) :
+ Image2D(image, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+ * See Memory for further details.
+ */
+ Image2DGL& operator = (const cl_mem& rhs)
+ {
+ Image2D::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DGL(const Image2DGL& img) : Image2D(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DGL& operator = (const Image2DGL &img)
+ {
+ Image2D::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DGL(Image2DGL&& img) CL_HPP_NOEXCEPT_ : Image2D(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DGL& operator = (Image2DGL &&img)
+ {
+ Image2D::operator=(std::move(img));
+ return *this;
+ }
+
+} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+/*! \class Image2DArray
+ * \brief Image interface for arrays of 2D images.
+ */
+class Image2DArray : public Image
+{
+public:
+ Image2DArray(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ size_type arraySize,
+ size_type width,
+ size_type height,
+ size_type rowPitch,
+ size_type slicePitch,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE2D_ARRAY,
+ width,
+ height,
+ 0, // depth (unused)
+ arraySize,
+ rowPitch,
+ slicePitch,
+ 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image2DArray() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image2DArray(const cl_mem& imageArray, bool retainObject = false) : Image(imageArray, retainObject) { }
+
+ Image2DArray& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DArray(const Image2DArray& img) : Image(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DArray& operator = (const Image2DArray &img)
+ {
+ Image::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DArray(Image2DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image2DArray& operator = (Image2DArray &&img)
+ {
+ Image::operator=(std::move(img));
+ return *this;
+ }
+};
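+
+/* Usage sketch (illustrative only; the format, sizes and flags are
+ * assumptions, not part of the header): creating a 4-slice RGBA image
+ * array in the default context.
+ *
+ *   cl_int err;
+ *   cl::Context ctx = cl::Context::getDefault(&err);
+ *   cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
+ *   cl::Image2DArray arr(ctx, CL_MEM_READ_ONLY, fmt,
+ *                        4, 256, 256, 0, 0, NULL, &err);
+ */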
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+/*! \brief Class interface for 3D Image Memory objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image3D : public Image
+{
+public:
+ /*! \brief Constructs a 3D Image in a specified context.
+ *
+ * Wraps clCreateImage().
+ */
+ Image3D(
+ const Context& context,
+ cl_mem_flags flags,
+ ImageFormat format,
+ size_type width,
+ size_type height,
+ size_type depth,
+ size_type row_pitch = 0,
+ size_type slice_pitch = 0,
+ void* host_ptr = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ bool useCreateImage;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ // Run-time decision based on the actual platform
+ {
+ cl_uint version = detail::getContextPlatformVersion(context());
+ useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+ }
+#elif CL_HPP_TARGET_OPENCL_VERSION >= 120
+ useCreateImage = true;
+#else
+ useCreateImage = false;
+#endif
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ if (useCreateImage)
+ {
+ cl_image_desc desc =
+ {
+ CL_MEM_OBJECT_IMAGE3D,
+ width,
+ height,
+ depth,
+ 0, // array size (unused)
+ row_pitch,
+ slice_pitch,
+ 0, 0, 0
+ };
+ object_ = ::clCreateImage(
+ context(),
+ flags,
+ &format,
+ &desc,
+ host_ptr,
+ &error);
+
+ detail::errHandler(error, __CREATE_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ if (!useCreateImage)
+ {
+ object_ = ::clCreateImage3D(
+ context(), flags, &format, width, height, depth, row_pitch,
+ slice_pitch, host_ptr, &error);
+
+ detail::errHandler(error, __CREATE_IMAGE3D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image3D() : Image() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image3D(const cl_mem& image3D, bool retainObject = false) :
+ Image(image3D, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3D& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3D(const Image3D& img) : Image(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3D& operator = (const Image3D &img)
+ {
+ Image::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3D(Image3D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3D& operator = (Image3D &&img)
+ {
+ Image::operator=(std::move(img));
+ return *this;
+ }
+};
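+
+/* Usage sketch (illustrative only; the format and extents are assumptions):
+ * creating a 64x64x64 single-channel float volume with no host pointer,
+ * letting the constructor pick clCreateImage() or clCreateImage3D() as
+ * described above.
+ *
+ *   cl_int err;
+ *   cl::Context ctx = cl::Context::getDefault(&err);
+ *   cl::ImageFormat fmt(CL_R, CL_FLOAT);
+ *   cl::Image3D vol(ctx, CL_MEM_READ_WRITE, fmt, 64, 64, 64, 0, 0, NULL, &err);
+ */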
+
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+/*! \brief Class interface for GL 3D Image Memory objects.
+ *
+ * This is provided to facilitate interoperability with OpenGL.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Image3DGL : public Image3D
+{
+public:
+ /*! \brief Constructs an Image3DGL in a specified context, from a given
+ * GL Texture.
+ *
+ * Wraps clCreateFromGLTexture3D().
+ */
+ Image3DGL(
+ const Context& context,
+ cl_mem_flags flags,
+ cl_GLenum target,
+ cl_GLint miplevel,
+ cl_GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture3D(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Image3DGL() : Image3D() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit Image3DGL(const cl_mem& image, bool retainObject = false) :
+ Image3D(image, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Image3DGL& operator = (const cl_mem& rhs)
+ {
+ Image3D::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3DGL(const Image3DGL& img) : Image3D(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3DGL& operator = (const Image3DGL &img)
+ {
+ Image3D::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3DGL(Image3DGL&& img) CL_HPP_NOEXCEPT_ : Image3D(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Image3DGL& operator = (Image3DGL &&img)
+ {
+ Image3D::operator=(std::move(img));
+ return *this;
+ }
+};
+#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+/*! \class ImageGL
+ * \brief General image interface for GL interop.
+ * 2D and 3D GL images are abstracted into this single class, which
+ * wraps any GL-sourced image: the dimensionality and format were
+ * already fixed when the source GL texture was created.
+ */
+class ImageGL : public Image
+{
+public:
+ ImageGL(
+ const Context& context,
+ cl_mem_flags flags,
+ cl_GLenum target,
+ cl_GLint miplevel,
+ cl_GLuint texobj,
+ cl_int * err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateFromGLTexture(
+ context(),
+ flags,
+ target,
+ miplevel,
+ texobj,
+ &error);
+
+ detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ ImageGL() : Image() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * See Memory for further details.
+ */
+ explicit ImageGL(const cl_mem& image, bool retainObject = false) :
+ Image(image, retainObject) { }
+
+ ImageGL& operator = (const cl_mem& rhs)
+ {
+ Image::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ ImageGL(const ImageGL& img) : Image(img) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ ImageGL& operator = (const ImageGL &img)
+ {
+ Image::operator=(img);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ ImageGL(ImageGL&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ ImageGL& operator = (ImageGL &&img)
+ {
+ Image::operator=(std::move(img));
+ return *this;
+ }
+};
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+/*! \brief Class interface for Pipe Memory Objects.
+ *
+ * See Memory for details about copy semantics, etc.
+ *
+ * \see Memory
+ */
+class Pipe : public Memory
+{
+public:
+
+ /*! \brief Constructs a Pipe in a specified context.
+ *
+ * Wraps clCreatePipe().
+ * @param context Context in which to create the pipe.
+     * The pipe is always created with CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS;
+     * there is no user-supplied flags parameter.
+ * @param packet_size Size in bytes of a single packet of the pipe.
+ * @param max_packets Number of packets that may be stored in the pipe.
+ *
+ */
+ Pipe(
+ const Context& context,
+ cl_uint packet_size,
+ cl_uint max_packets,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS;
+ object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error);
+
+ detail::errHandler(error, __CREATE_PIPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+    /*! \brief Constructs a Pipe in the default context.
+ *
+ * Wraps clCreatePipe().
+     * The pipe is always created with CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS;
+     * there is no user-supplied flags parameter.
+ * @param packet_size Size in bytes of a single packet of the pipe.
+ * @param max_packets Number of packets that may be stored in the pipe.
+ *
+ */
+ Pipe(
+ cl_uint packet_size,
+ cl_uint max_packets,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(err);
+
+ cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS;
+ object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error);
+
+ detail::errHandler(error, __CREATE_PIPE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ //! \brief Default constructor - initializes to NULL.
+ Pipe() : Memory() { }
+
+ /*! \brief Constructor from cl_mem - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with earlier versions.
+ *
+ * See Memory for further details.
+ */
+ explicit Pipe(const cl_mem& pipe, bool retainObject = false) :
+ Memory(pipe, retainObject) { }
+
+ /*! \brief Assignment from cl_mem - performs shallow copy.
+ *
+ * See Memory for further details.
+ */
+ Pipe& operator = (const cl_mem& rhs)
+ {
+ Memory::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Pipe(const Pipe& pipe) : Memory(pipe) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Pipe& operator = (const Pipe &pipe)
+ {
+ Memory::operator=(pipe);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Pipe(Pipe&& pipe) CL_HPP_NOEXCEPT_ : Memory(std::move(pipe)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Pipe& operator = (Pipe &&pipe)
+ {
+ Memory::operator=(std::move(pipe));
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetMemObjectInfo().
+ template <typename T>
+ cl_int getInfo(cl_pipe_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetPipeInfo, object_, name, param),
+ __GET_PIPE_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_pipe_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_pipe_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+}; // class Pipe
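+
+/* Usage sketch (illustrative; the packet type, capacity and kernel object
+ * are assumptions): creating a pipe of 128 int-sized packets in the default
+ * context and binding it to a kernel argument. Requires an OpenCL 2.0
+ * runtime.
+ *
+ *   cl_int err;
+ *   cl::Pipe pipe(sizeof(cl_int), 128, &err);
+ *   kernel.setArg(0, pipe); // Pipe is a Memory subclass
+ */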
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+
+/*! \brief Class interface for cl_sampler.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_sampler as the original. For details, see
+ * clRetainSampler() and clReleaseSampler().
+ *
+ * \see cl_sampler
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+ //! \brief Default constructor - initializes to NULL.
+ Sampler() { }
+
+ /*! \brief Constructs a Sampler in a specified context.
+ *
+ * Wraps clCreateSampler().
+ */
+ Sampler(
+ const Context& context,
+ cl_bool normalized_coords,
+ cl_addressing_mode addressing_mode,
+ cl_filter_mode filter_mode,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_sampler_properties sampler_properties[] = {
+ CL_SAMPLER_NORMALIZED_COORDS, normalized_coords,
+ CL_SAMPLER_ADDRESSING_MODE, addressing_mode,
+ CL_SAMPLER_FILTER_MODE, filter_mode,
+ 0 };
+ object_ = ::clCreateSamplerWithProperties(
+ context(),
+ sampler_properties,
+ &error);
+
+ detail::errHandler(error, __CREATE_SAMPLER_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#else
+ object_ = ::clCreateSampler(
+ context(),
+ normalized_coords,
+ addressing_mode,
+ filter_mode,
+ &error);
+
+ detail::errHandler(error, __CREATE_SAMPLER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#endif
+ }
+
+ /*! \brief Constructor from cl_sampler - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * This effectively transfers ownership of a refcount on the cl_sampler
+ * into the new Sampler object.
+ */
+ explicit Sampler(const cl_sampler& sampler, bool retainObject = false) :
+ detail::Wrapper<cl_type>(sampler, retainObject) { }
+
+ /*! \brief Assignment operator from cl_sampler - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseSampler() on the value previously held by this instance.
+ */
+ Sampler& operator = (const cl_sampler& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Sampler(const Sampler& sam) : detail::Wrapper<cl_type>(sam) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Sampler& operator = (const Sampler &sam)
+ {
+ detail::Wrapper<cl_type>::operator=(sam);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Sampler(Sampler&& sam) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(sam)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Sampler& operator = (Sampler &&sam)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(sam));
+ return *this;
+ }
+
+ //! \brief Wrapper for clGetSamplerInfo().
+ template <typename T>
+ cl_int getInfo(cl_sampler_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+ __GET_SAMPLER_INFO_ERR);
+ }
+
+ //! \brief Wrapper for clGetSamplerInfo() that returns by value.
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_sampler_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_sampler_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+};
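+
+/* Usage sketch (illustrative; the sampler settings and context are
+ * assumptions): creating a nearest-filtering, edge-clamping sampler and
+ * querying one of its properties back.
+ *
+ *   cl_int err;
+ *   cl::Sampler smp(ctx, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE,
+ *                   CL_FILTER_NEAREST, &err);
+ *   cl_filter_mode mode = smp.getInfo<CL_SAMPLER_FILTER_MODE>(&err);
+ */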
+
+class Program;
+class CommandQueue;
+class DeviceCommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+ size_type sizes_[3];
+ cl_uint dimensions_;
+
+public:
+ //! \brief Default constructor - resulting range has zero dimensions.
+ NDRange()
+ : dimensions_(0)
+ {
+ sizes_[0] = 0;
+ sizes_[1] = 0;
+ sizes_[2] = 0;
+ }
+
+ //! \brief Constructs one-dimensional range.
+ NDRange(size_type size0)
+ : dimensions_(1)
+ {
+ sizes_[0] = size0;
+ sizes_[1] = 1;
+ sizes_[2] = 1;
+ }
+
+ //! \brief Constructs two-dimensional range.
+ NDRange(size_type size0, size_type size1)
+ : dimensions_(2)
+ {
+ sizes_[0] = size0;
+ sizes_[1] = size1;
+ sizes_[2] = 1;
+ }
+
+ //! \brief Constructs three-dimensional range.
+ NDRange(size_type size0, size_type size1, size_type size2)
+ : dimensions_(3)
+ {
+ sizes_[0] = size0;
+ sizes_[1] = size1;
+ sizes_[2] = size2;
+ }
+
+ /*! \brief Conversion operator to const size_type *.
+ *
+ * \returns a pointer to the size of the first dimension.
+ */
+    operator const size_type*() const
+    {
+ return sizes_;
+ }
+
+ //! \brief Queries the number of dimensions in the range.
+ size_type dimensions() const
+ {
+ return dimensions_;
+ }
+
+    //! \brief Returns the size of the range data in bytes, based on the
+    //! runtime number of dimensions.
+ size_type size() const
+ {
+ return dimensions_*sizeof(size_type);
+ }
+
+ size_type* get()
+ {
+ return sizes_;
+ }
+
+ const size_type* get() const
+ {
+ return sizes_;
+ }
+};
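+
+/* Usage sketch (illustrative sizes): NDRange values are passed to
+ * CommandQueue::enqueueNDRangeKernel to describe the launch geometry.
+ *
+ *   cl::NDRange offset = cl::NullRange;     // no offset
+ *   cl::NDRange global(1024, 768);          // 2D global work size
+ *   cl::NDRange local(16, 16);              // 2D work-group size
+ */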
+
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
+
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
+{
+ size_type size_;
+};
+
+namespace detail {
+
+template <typename T, class Enable = void>
+struct KernelArgumentHandler;
+
+// Enabled for objects that are not subclasses of cl::Memory
+// (pointers, scalar constants, etc.)
+template <typename T>
+struct KernelArgumentHandler<T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type>
+{
+ static size_type size(const T&) { return sizeof(T); }
+ static const T* ptr(const T& value) { return &value; }
+};
+
+// Enabled for subclasses of cl::Memory, where we extract a reference to the
+// underlying cl_mem and pass that in for safety
+template <typename T>
+struct KernelArgumentHandler<T, typename std::enable_if<std::is_base_of<cl::Memory, T>::value>::type>
+{
+ static size_type size(const T&) { return sizeof(cl_mem); }
+ static const cl_mem* ptr(const T& value) { return &(value()); }
+};
+
+// Specialization for DeviceCommandQueue defined later
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg, void>
+{
+ static size_type size(const LocalSpaceArg& value) { return value.size_; }
+ static const void* ptr(const LocalSpaceArg&) { return NULL; }
+};
+
+}
+//! \endcond
+
+/*! \brief Helper function for generating LocalSpaceArg objects,
+ * for use with Kernel::setArg.
+ */
+inline LocalSpaceArg
+Local(size_type size)
+{
+ LocalSpaceArg ret = { size };
+ return ret;
+}
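+
+/* Usage sketch (illustrative size; the kernel object is an assumption):
+ * reserving 256 floats of local memory for kernel argument 1.
+ *
+ *   kernel.setArg(1, cl::Local(256 * sizeof(float)));
+ */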
+
+/*! \brief Class interface for cl_kernel.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_kernel as the original. For details, see
+ * clRetainKernel() and clReleaseKernel().
+ *
+ * \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+ inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+ //! \brief Default constructor - initializes to NULL.
+ Kernel() { }
+
+ /*! \brief Constructor from cl_kernel - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ * This effectively transfers ownership of a refcount on the cl_kernel
+ * into the new Kernel object.
+ */
+ explicit Kernel(const cl_kernel& kernel, bool retainObject = false) :
+ detail::Wrapper<cl_type>(kernel, retainObject) { }
+
+ /*! \brief Assignment operator from cl_kernel - takes ownership.
+ *
+ * This effectively transfers ownership of a refcount on the rhs and calls
+ * clReleaseKernel() on the value previously held by this instance.
+ */
+ Kernel& operator = (const cl_kernel& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Kernel& operator = (const Kernel &kernel)
+ {
+ detail::Wrapper<cl_type>::operator=(kernel);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Kernel(Kernel&& kernel) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(kernel)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Kernel& operator = (Kernel &&kernel)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(kernel));
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_kernel_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetKernelInfo, object_, name, param),
+ __GET_KERNEL_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ template <typename T>
+ cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+ __GET_KERNEL_ARG_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+ getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_arg_info, name>::param_type param;
+ cl_int result = getArgInfo(argIndex, name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+ template <typename T>
+ cl_int getWorkGroupInfo(
+ const Device& device, cl_kernel_work_group_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+ __GET_KERNEL_WORK_GROUP_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+ getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_kernel_work_group_info, name>::param_type param;
+ cl_int result = getWorkGroupInfo(device, name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
+ cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const
+ {
+ typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR;
+ static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = NULL;
+ CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR);
+
+ return detail::errHandler(
+ pfn_clGetKernelSubGroupInfoKHR(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr),
+ __GET_KERNEL_ARG_INFO_ERR);
+ }
+
+ template <cl_int name>
+ size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const
+ {
+ size_type param;
+ cl_int result = getSubGroupInfo(dev, name, range, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ /*! \brief setArg overload taking a shared_ptr type
+ */
+ template<typename T, class D>
+ cl_int setArg(cl_uint index, const cl::pointer<T, D> &argPtr)
+ {
+ return detail::errHandler(
+ ::clSetKernelArgSVMPointer(object_, index, argPtr.get()),
+ __SET_KERNEL_ARGS_ERR);
+ }
+
+ /*! \brief setArg overload taking a vector type.
+ */
+ template<typename T, class Alloc>
+ cl_int setArg(cl_uint index, const cl::vector<T, Alloc> &argPtr)
+ {
+ return detail::errHandler(
+ ::clSetKernelArgSVMPointer(object_, index, argPtr.data()),
+ __SET_KERNEL_ARGS_ERR);
+ }
+
+ /*! \brief setArg overload taking a pointer type
+ */
+ template<typename T>
+ typename std::enable_if<std::is_pointer<T>::value, cl_int>::type
+ setArg(cl_uint index, const T argPtr)
+ {
+ return detail::errHandler(
+ ::clSetKernelArgSVMPointer(object_, index, argPtr),
+ __SET_KERNEL_ARGS_ERR);
+ }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+ /*! \brief setArg overload taking a POD type
+ */
+ template <typename T>
+ typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type
+ setArg(cl_uint index, const T &value)
+ {
+ return detail::errHandler(
+ ::clSetKernelArg(
+ object_,
+ index,
+ detail::KernelArgumentHandler<T>::size(value),
+ detail::KernelArgumentHandler<T>::ptr(value)),
+ __SET_KERNEL_ARGS_ERR);
+ }
+
+ cl_int setArg(cl_uint index, size_type size, const void* argPtr)
+ {
+ return detail::errHandler(
+ ::clSetKernelArg(object_, index, size, argPtr),
+ __SET_KERNEL_ARGS_ERR);
+ }
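+
+    /* Usage sketch (illustrative; the buffer, value and hostSideData pointer
+     * are assumptions): memory objects and POD values go through the typed
+     * overload above, while raw (size, pointer) pairs use the untyped one.
+     *
+     *   cl::Buffer buf(CL_MEM_READ_WRITE, 1024);
+     *   cl_int n = 42;
+     *   kernel.setArg(0, buf); // passes the underlying cl_mem
+     *   kernel.setArg(1, n);   // passes sizeof(cl_int) bytes
+     *   kernel.setArg(2, sizeof(cl_float) * 4, hostSideData); // untyped form
+     */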
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ /*!
+ * Specify a vector of SVM pointers that the kernel may access in
+ * addition to its arguments.
+ */
+ cl_int setSVMPointers(const vector<void*> &pointerList)
+ {
+ return detail::errHandler(
+ ::clSetKernelExecInfo(
+ object_,
+ CL_KERNEL_EXEC_INFO_SVM_PTRS,
+ sizeof(void*)*pointerList.size(),
+ pointerList.data()));
+ }
+
+ /*!
+ * Specify a std::array of SVM pointers that the kernel may access in
+ * addition to its arguments.
+ */
+ template<int ArrayLength>
+ cl_int setSVMPointers(const std::array<void*, ArrayLength> &pointerList)
+ {
+ return detail::errHandler(
+ ::clSetKernelExecInfo(
+ object_,
+ CL_KERNEL_EXEC_INFO_SVM_PTRS,
+ sizeof(void*)*pointerList.size(),
+ pointerList.data()));
+ }
+
+ /*! \brief Enable fine-grained system SVM.
+ *
+ * \note It is only possible to enable fine-grained system SVM if all devices
+ * in the context associated with kernel support it.
+ *
+ * \param svmEnabled True if fine-grained system SVM is requested. False otherwise.
+     * \return CL_SUCCESS if the function was executed successfully. CL_INVALID_OPERATION
+ * if no devices in the context support fine-grained system SVM.
+ *
+ * \see clSetKernelExecInfo
+ */
+ cl_int enableFineGrainedSystemSVM(bool svmEnabled)
+ {
+ cl_bool svmEnabled_ = svmEnabled ? CL_TRUE : CL_FALSE;
+ return detail::errHandler(
+ ::clSetKernelExecInfo(
+ object_,
+ CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM,
+ sizeof(cl_bool),
+ &svmEnabled_
+ )
+ );
+ }
+
+ template<int index, int ArrayLength, class D, typename T0, typename... Ts>
+ void setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, const pointer<T0, D> &t0, Ts... ts)
+ {
+ pointerList[index] = static_cast<void*>(t0.get());
+ setSVMPointersHelper<index + 1, Ts...>(ts...);
+ }
+
+ template<int index, int ArrayLength, typename T0, typename... Ts>
+ typename std::enable_if<std::is_pointer<T0>::value, void>::type
+ setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, T0 t0, Ts... ts)
+ {
+ pointerList[index] = static_cast<void*>(t0);
+ setSVMPointersHelper<index + 1, Ts...>(ts...);
+ }
+
+ template<int index, int ArrayLength, typename T0, class D>
+ void setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, const pointer<T0, D> &t0)
+ {
+ pointerList[index] = static_cast<void*>(t0.get());
+ }
+
+ template<int index, int ArrayLength, typename T0>
+ typename std::enable_if<std::is_pointer<T0>::value, void>::type
+ setSVMPointersHelper(std::array<void*, ArrayLength> &pointerList, T0 t0)
+ {
+ pointerList[index] = static_cast<void*>(t0);
+ }
+
+ template<typename T0, typename... Ts>
+ cl_int setSVMPointers(const T0 &t0, Ts... ts)
+ {
+ std::array<void*, 1 + sizeof...(Ts)> pointerList;
+
+ setSVMPointersHelper<0, 1 + sizeof...(Ts)>(pointerList, t0, ts...);
+ return detail::errHandler(
+ ::clSetKernelExecInfo(
+ object_,
+ CL_KERNEL_EXEC_INFO_SVM_PTRS,
+ sizeof(void*)*(1 + sizeof...(Ts)),
+ pointerList.data()));
+ }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+};
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ typedef vector<vector<unsigned char>> Binaries;
+ typedef vector<string> Sources;
+#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ typedef vector<std::pair<const void*, size_type> > Binaries;
+ typedef vector<std::pair<const char*, size_type> > Sources;
+#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+
+ Program(
+ const string& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const size_type length = source.size();
+
+ Context context = Context::getDefault(err);
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ "-cl-std=CL2.0",
+#else
+ "",
+#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ NULL,
+ NULL);
+
+ detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Program(
+ const Context& context,
+ const string& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const size_type length = source.size();
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ "-cl-std=CL2.0",
+#else
+ "",
+#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD)
+ NULL,
+ NULL);
+
+ detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /**
+ * Create a program from a vector of source strings and the default context.
+ * Does not compile or link the program.
+ */
+ Program(
+ const Sources& sources,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ Context context = Context::getDefault(err);
+
+ const size_type n = (size_type)sources.size();
+
+ vector<size_type> lengths(n);
+ vector<const char*> strings(n);
+
+ for (size_type i = 0; i < n; ++i) {
+#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ strings[i] = sources[(int)i].data();
+ lengths[i] = sources[(int)i].length();
+#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ strings[i] = sources[(int)i].first;
+ lengths[i] = sources[(int)i].second;
+#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ }
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)n, strings.data(), lengths.data(), &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /**
+ * Create a program from a vector of source strings and a provided context.
+ * Does not compile or link the program.
+ */
+ Program(
+ const Context& context,
+ const Sources& sources,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const size_type n = (size_type)sources.size();
+
+ vector<size_type> lengths(n);
+ vector<const char*> strings(n);
+
+ for (size_type i = 0; i < n; ++i) {
+#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ strings[i] = sources[(int)i].data();
+ lengths[i] = sources[(int)i].length();
+#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ strings[i] = sources[(int)i].first;
+ lengths[i] = sources[(int)i].second;
+#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ }
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)n, strings.data(), lengths.data(), &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /**
+ * Construct a program object from a list of devices and a per-device list of binaries.
+ * \param context A valid OpenCL context in which to construct the program.
+ * \param devices A vector of OpenCL device objects for which the program will be created.
+ * \param binaries A vector of pairs of a pointer to a binary object and its length.
+ * \param binaryStatus An optional vector that on completion will be resized to
+ * match the size of binaries and filled with values to specify if each binary
+ * was successfully loaded.
+ * Set to CL_SUCCESS if the binary was successfully loaded.
+ * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+ * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+ * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+ * CL_INVALID_CONTEXT if context is not a valid context.
+ * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
+ * or if any entry in binaries is NULL or has length 0.
+ * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+ * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+ * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+ */
+ Program(
+ const Context& context,
+ const vector<Device>& devices,
+ const Binaries& binaries,
+ vector<cl_int>* binaryStatus = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const size_type numDevices = devices.size();
+
+ // Catch size mismatch early and return
+ if(binaries.size() != numDevices) {
+ error = CL_INVALID_VALUE;
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+
+ vector<size_type> lengths(numDevices);
+ vector<const unsigned char*> images(numDevices);
+#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ for (size_type i = 0; i < numDevices; ++i) {
+ images[i] = binaries[i].data();
+            lengths[i] = binaries[i].size();
+ }
+#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+ for (size_type i = 0; i < numDevices; ++i) {
+ images[i] = (const unsigned char*)binaries[i].first;
+            lengths[i] = binaries[i].second;
+ }
+#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY)
+
+ vector<cl_device_id> deviceIDs(numDevices);
+ for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ if(binaryStatus) {
+ binaryStatus->resize(numDevices);
+ }
+
+ object_ = ::clCreateProgramWithBinary(
+ context(), (cl_uint) devices.size(),
+ deviceIDs.data(),
+ lengths.data(), images.data(), (binaryStatus != NULL && numDevices > 0)
+ ? &binaryStatus->front()
+ : NULL, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ /**
+     * Create a program using built-in kernels.
+     * \param kernelNames Semicolon-separated list of built-in kernel names.
+ */
+ Program(
+ const Context& context,
+ const vector<Device>& devices,
+ const string& kernelNames,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+
+ size_type numDevices = devices.size();
+ vector<cl_device_id> deviceIDs(numDevices);
+ for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ object_ = ::clCreateProgramWithBuiltInKernels(
+ context(),
+ (cl_uint) devices.size(),
+ deviceIDs.data(),
+ kernelNames.c_str(),
+ &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+ Program() { }
+
+
+    /*! \brief Constructor from cl_program - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ */
+ explicit Program(const cl_program& program, bool retainObject = false) :
+ detail::Wrapper<cl_type>(program, retainObject) { }
+
+ Program& operator = (const cl_program& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Program(const Program& program) : detail::Wrapper<cl_type>(program) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Program& operator = (const Program &program)
+ {
+ detail::Wrapper<cl_type>::operator=(program);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Program(Program&& program) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(program)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Program& operator = (Program &&program)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(program));
+ return *this;
+ }
+
+ cl_int build(
+ const vector<Device>& devices,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ size_type numDevices = devices.size();
+ vector<cl_device_id> deviceIDs(numDevices);
+
+ for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ cl_int buildError = ::clBuildProgram(
+ object_,
+ (cl_uint)
+ devices.size(),
+ deviceIDs.data(),
+ options,
+ notifyFptr,
+ data);
+
+ return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+ }
+
+ cl_int build(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ cl_int buildError = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ notifyFptr,
+ data);
+
+
+ return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ cl_int compile(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ cl_int error = ::clCompileProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ 0,
+ NULL,
+ NULL,
+ notifyFptr,
+ data);
+ return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo<CL_PROGRAM_BUILD_LOG>());
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+ template <typename T>
+ cl_int getInfo(cl_program_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetProgramInfo, object_, name, param),
+ __GET_PROGRAM_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_program_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ template <typename T>
+ cl_int getBuildInfo(
+ const Device& device, cl_program_build_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetProgramBuildInfo, object_, device(), name, param),
+ __GET_PROGRAM_BUILD_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_program_build_info, name>::param_type
+ getBuildInfo(const Device& device, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_build_info, name>::param_type param;
+ cl_int result = getBuildInfo(device, name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /**
+ * Build info function that returns a vector of device/info pairs for the specified
+ * info type and for all devices in the program.
+ * On an error reading the info for any device, an empty vector of info will be returned.
+ */
+ template <cl_int name>
+ vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>>
+ getBuildInfo(cl_int *err = NULL) const
+ {
+ cl_int result = CL_SUCCESS;
+
+ auto devs = getInfo<CL_PROGRAM_DEVICES>(&result);
+ vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>>
+ devInfo;
+
+        // If getInfo itself failed, report the error and return an empty vector
+ if (result != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = result;
+ }
+ return devInfo;
+ }
+
+ for (cl::Device d : devs) {
+ typename detail::param_traits<
+ detail::cl_program_build_info, name>::param_type param;
+ result = getBuildInfo(d, name, &param);
+ devInfo.push_back(
+ std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, name>::param_type>
+ (d, param));
+ if (result != CL_SUCCESS) {
+ // On error, leave the loop and return the error code
+ break;
+ }
+ }
+ if (err != NULL) {
+ *err = result;
+ }
+ if (result != CL_SUCCESS) {
+ devInfo.clear();
+ }
+ return devInfo;
+ }
+
+ cl_int createKernels(vector<Kernel>* kernels)
+ {
+ cl_uint numKernels;
+ cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ vector<cl_kernel> value(numKernels);
+
+ err = ::clCreateKernelsInProgram(
+ object_, numKernels, value.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ if (kernels) {
+ kernels->resize(value.size());
+
+            // Assign into the output vector, wrapping each returned
+            // cl_kernel in a Kernel object
+ for (size_type i = 0; i < value.size(); i++) {
+ // We do not need to retain because this kernel is being created
+ // by the runtime
+ (*kernels)[i] = Kernel(value[i], false);
+ }
+ }
+ return CL_SUCCESS;
+ }
+};
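+
+/* Usage sketch (illustrative; the kernel source is an assumption): building
+ * a program from source against the default context and retrieving the
+ * build log on failure.
+ *
+ *   cl_int err;
+ *   cl::Program prog("__kernel void k() {}", false, &err);
+ *   if (prog.build() != CL_SUCCESS) {
+ *       auto log = prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(cl::Device::getDefault());
+ *   }
+ */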
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+inline Program linkProgram(
+ Program input1,
+ Program input2,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error_local = CL_SUCCESS;
+
+ cl_program programs[2] = { input1(), input2() };
+
+ Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>(&error_local);
+ if(error_local!=CL_SUCCESS) {
+ detail::errHandler(error_local, __LINK_PROGRAM_ERR);
+ }
+
+ cl_program prog = ::clLinkProgram(
+ ctx(),
+ 0,
+ NULL,
+ options,
+ 2,
+ programs,
+ notifyFptr,
+ data,
+ &error_local);
+
+    detail::errHandler(error_local, __LINK_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = error_local;
+ }
+
+ return Program(prog);
+}
+
+inline Program linkProgram(
+ vector<Program> inputPrograms,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error_local = CL_SUCCESS;
+
+ vector<cl_program> programs(inputPrograms.size());
+
+ for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+ programs[i] = inputPrograms[i]();
+ }
+
+ Context ctx;
+ if(inputPrograms.size() > 0) {
+ ctx = inputPrograms[0].getInfo<CL_PROGRAM_CONTEXT>(&error_local);
+ if(error_local!=CL_SUCCESS) {
+ detail::errHandler(error_local, __LINK_PROGRAM_ERR);
+ }
+ }
+ cl_program prog = ::clLinkProgram(
+ ctx(),
+ 0,
+ NULL,
+ options,
+ (cl_uint)inputPrograms.size(),
+ programs.data(),
+ notifyFptr,
+ data,
+ &error_local);
+
+    detail::errHandler(error_local, __LINK_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = error_local;
+ }
+
+ return Program(prog, false);
+}
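+
+/* Usage sketch (illustrative; srcA and srcB are hypothetical source
+ * strings): compile two programs separately, then link them into one.
+ * Requires an OpenCL 1.2 runtime.
+ *
+ *   cl::Program a(srcA), b(srcB);
+ *   a.compile();
+ *   b.compile();
+ *   cl_int err;
+ *   cl::Program linked = cl::linkProgram(a, b, NULL, NULL, NULL, &err);
+ */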
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+// Template specialization for CL_PROGRAM_BINARIES
+template <>
+inline cl_int cl::Program::getInfo(cl_program_info name, vector<vector<unsigned char>>* param) const
+{
+ if (name != CL_PROGRAM_BINARIES) {
+ return CL_INVALID_VALUE;
+ }
+ if (param) {
+ // Resize the parameter array appropriately for each allocation
+ // and pass down to the helper
+
+ vector<size_type> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+ size_type numBinaries = sizes.size();
+
+ // Resize the parameter array and constituent arrays
+ param->resize(numBinaries);
+        for (size_type i = 0; i < numBinaries; ++i) {
+ (*param)[i].resize(sizes[i]);
+ }
+
+ return detail::errHandler(
+ detail::getInfo(&::clGetProgramInfo, object_, name, param),
+ __GET_PROGRAM_INFO_ERR);
+ }
+
+ return CL_SUCCESS;
+}
+
+template<>
+inline vector<vector<unsigned char>> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+ vector<vector<unsigned char>> binariesVectors;
+
+ cl_int result = getInfo(CL_PROGRAM_BINARIES, &binariesVectors);
+ if (err != NULL) {
+ *err = result;
+ }
+ return binariesVectors;
+}
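+
+/* Usage sketch (prog is a previously built cl::Program): retrieving the
+ * compiled binaries for all devices via the specialization above; each
+ * inner vector is sized automatically from CL_PROGRAM_BINARY_SIZES.
+ *
+ *   cl_int err;
+ *   cl::vector<cl::vector<unsigned char>> bins =
+ *       prog.getInfo<CL_PROGRAM_BINARIES>(&err);
+ */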
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+ cl_int error;
+
+ object_ = ::clCreateKernel(program(), name, &error);
+ detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+ if (err != NULL) {
+ *err = error;
+ }
+}
+
+enum class QueueProperties : cl_command_queue_properties
+{
+ None = 0,
+ Profiling = CL_QUEUE_PROFILING_ENABLE,
+ OutOfOrder = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+};
+
+inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs)
+{
+ return static_cast<QueueProperties>(static_cast<cl_command_queue_properties>(lhs) | static_cast<cl_command_queue_properties>(rhs));
+}
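+
+/* Usage sketch: combining properties with the operator above when
+ * constructing a queue on the default context and device.
+ *
+ *   cl_int err;
+ *   cl::CommandQueue q(
+ *       cl::QueueProperties::Profiling | cl::QueueProperties::OutOfOrder,
+ *       &err);
+ */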
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+ static std::once_flag default_initialized_;
+ static CommandQueue default_;
+ static cl_int default_error_;
+
+ /*! \brief Create the default command queue returned by @ref getDefault.
+ *
+ * It sets default_error_ to indicate success or failure. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefault()
+ {
+ /* We don't want to throw an error from this function, so we have to
+ * catch and set the error flag.
+ */
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ try
+#endif
+ {
+            cl_int error;
+ Context context = Context::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ default_error_ = error;
+ }
+ else {
+ Device device = Device::getDefault();
+ default_ = CommandQueue(context, device, 0, &default_error_);
+ }
+ }
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ catch (cl::Error &e) {
+ default_error_ = e.err();
+ }
+#endif
+ }
+
+ /*! \brief Create the default command queue.
+ *
+ * This sets @c default_. It does not throw
+ * @c cl::Error.
+ */
+ static void makeDefaultProvided(const CommandQueue &c) {
+ default_ = c;
+ }
+
+public:
+#ifdef CL_HPP_UNIT_TEST_ENABLE
+ /*! \brief Reset the default.
+ *
+ * This sets @c default_ to an empty value to support cleanup in
+ * the unit test framework.
+ * This function is not thread safe.
+ */
+ static void unitTestClearDefault() {
+ default_ = CommandQueue();
+ }
+#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE
+
+
+ /*!
+ * \brief Constructs a CommandQueue based on passed properties.
+     * Will return a CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.
+ */
+ CommandQueue(
+ cl_command_queue_properties properties,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties, 0 };
+ if ((properties & CL_QUEUE_ON_DEVICE) == 0) {
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+ }
+ else {
+ error = CL_INVALID_QUEUE_PROPERTIES;
+ }
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#else
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#endif
+ }
+ }
+
+ /*!
+ * \brief Constructs a CommandQueue based on passed properties.
+     * Will return a CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.
+ */
+ CommandQueue(
+ QueueProperties properties,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
+
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#else
+ object_ = ::clCreateCommandQueue(
+ context(), device(), static_cast<cl_command_queue_properties>(properties), &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#endif
+ }
+ }
+
+ /*!
+     * \brief Constructs a CommandQueue for an implementation-defined device in the given context.
+     * Will return a CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.
+ */
+ explicit CommandQueue(
+ const Context& context,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ vector<cl::Device> devices;
+ error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+ if (error != CL_SUCCESS)
+ {
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties, 0 };
+ if ((properties & CL_QUEUE_ON_DEVICE) == 0) {
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), devices[0](), queue_properties, &error);
+ }
+ else {
+ error = CL_INVALID_QUEUE_PROPERTIES;
+ }
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#else
+ object_ = ::clCreateCommandQueue(
+ context(), devices[0](), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#endif
+
+ }
+
+ /*!
+     * \brief Constructs a CommandQueue for an implementation-defined device in the given context.
+     * Will return a CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.
+ */
+ explicit CommandQueue(
+ const Context& context,
+ QueueProperties properties,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ vector<cl::Device> devices;
+ error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+ if (error != CL_SUCCESS)
+ {
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), devices[0](), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#else
+ object_ = ::clCreateCommandQueue(
+ context(), devices[0](), static_cast<cl_command_queue_properties>(properties), &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#endif
+
+ }
+
+ /*!
+     * \brief Constructs a CommandQueue for a passed device and context.
+     * Will return a CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.
+ */
+ CommandQueue(
+ const Context& context,
+ const Device& device,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties, 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#else
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#endif
+ }
+
+ /*!
+     * \brief Constructs a CommandQueue for a passed device and context.
+     * Will return a CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified.
+ */
+ CommandQueue(
+ const Context& context,
+ const Device& device,
+ QueueProperties properties,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, static_cast<cl_queue_properties>(properties), 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#else
+ object_ = ::clCreateCommandQueue(
+ context(), device(), static_cast<cl_command_queue_properties>(properties), &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+#endif
+ }
+
+ static CommandQueue getDefault(cl_int * err = NULL)
+ {
+ std::call_once(default_initialized_, makeDefault);
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+#else // CL_HPP_TARGET_OPENCL_VERSION >= 200
+ detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_ERR);
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ /**
+ * Modify the default command queue to be used by
+ * subsequent operations.
+ * Will only set the default if no default was previously created.
+ * @return updated default command queue.
+ * Should be compared to the passed value to ensure that it was updated.
+ */
+ static CommandQueue setDefault(const CommandQueue &default_queue)
+ {
+ std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_queue));
+ detail::errHandler(default_error_);
+ return default_;
+ }
+
+ CommandQueue() { }
+
+
+    /*! \brief Constructor from cl_command_queue - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ */
+ explicit CommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) :
+ detail::Wrapper<cl_type>(commandQueue, retainObject) { }
+
+ CommandQueue& operator = (const cl_command_queue& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue(const CommandQueue& queue) : detail::Wrapper<cl_type>(queue) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue& operator = (const CommandQueue &queue)
+ {
+ detail::Wrapper<cl_type>::operator=(queue);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue(CommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(queue)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue& operator = (CommandQueue &&queue)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(queue));
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_command_queue_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetCommandQueueInfo, object_, name, param),
+ __GET_COMMAND_QUEUE_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_command_queue_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_command_queue_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ size_type offset,
+ size_type size,
+ void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ size_type offset,
+ size_type size,
+ const void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
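+
+    /*
+     * Illustrative sketch (not part of the API): a blocking write followed by
+     * a blocking read of the same buffer. "queue", "buffer" and the host
+     * vector are assumptions for the example.
+     *
+     *     std::vector<float> host(1024, 1.0f);
+     *     size_t bytes = host.size() * sizeof(float);
+     *     queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, bytes, host.data());
+     *     queue.enqueueReadBuffer(buffer, CL_TRUE, 0, bytes, host.data());
+     */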
+
+ cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ size_type src_offset,
+ size_type dst_offset,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBuffer(
+ object_, src(), dst(), src_offset, dst_offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQEUE_COPY_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const array<size_type, 3>& buffer_offset,
+ const array<size_type, 3>& host_offset,
+ const array<size_type, 3>& region,
+ size_type buffer_row_pitch,
+ size_type buffer_slice_pitch,
+ size_type host_row_pitch,
+ size_type host_slice_pitch,
+ void *ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBufferRect(
+ object_,
+ buffer(),
+ blocking,
+ buffer_offset.data(),
+ host_offset.data(),
+ region.data(),
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const array<size_type, 3>& buffer_offset,
+ const array<size_type, 3>& host_offset,
+ const array<size_type, 3>& region,
+ size_type buffer_row_pitch,
+ size_type buffer_slice_pitch,
+ size_type host_row_pitch,
+ size_type host_slice_pitch,
+ void *ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteBufferRect(
+ object_,
+ buffer(),
+ blocking,
+ buffer_offset.data(),
+ host_offset.data(),
+ region.data(),
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBufferRect(
+ const Buffer& src,
+ const Buffer& dst,
+ const array<size_type, 3>& src_origin,
+ const array<size_type, 3>& dst_origin,
+ const array<size_type, 3>& region,
+ size_type src_row_pitch,
+ size_type src_slice_pitch,
+ size_type dst_row_pitch,
+ size_type dst_slice_pitch,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBufferRect(
+ object_,
+ src(),
+ dst(),
+ src_origin.data(),
+ dst_origin.data(),
+ region.data(),
+ src_row_pitch,
+ src_slice_pitch,
+ dst_row_pitch,
+ dst_slice_pitch,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQEUE_COPY_BUFFER_RECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ /**
+ * Enqueue a command to fill a buffer object with a pattern
+ * of a given size. The pattern is specified as a vector type.
+ * \tparam PatternType The datatype of the pattern field.
+ * The pattern type must be an accepted OpenCL data type.
+     * \param offset Is the offset in bytes into the buffer at
+     *     which to start filling. This must be a multiple of
+     *     the pattern size.
+     * \param size Is the size in bytes of the region to fill.
+     *     This must be a multiple of the pattern size.
+ */
+ template<typename PatternType>
+ cl_int enqueueFillBuffer(
+ const Buffer& buffer,
+ PatternType pattern,
+ size_type offset,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillBuffer(
+ object_,
+ buffer(),
+ static_cast<void*>(&pattern),
+ sizeof(PatternType),
+ offset,
+ size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
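+
+    /*
+     * Illustrative sketch (not part of the API): zero-filling a buffer of
+     * 1024 cl_int elements. Offset and size are multiples of the pattern
+     * size, as required. "queue" and "buffer" are assumptions.
+     *
+     *     queue.enqueueFillBuffer(buffer, cl_int(0), 0, 1024 * sizeof(cl_int));
+     */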
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+ cl_int enqueueReadImage(
+ const Image& image,
+ cl_bool blocking,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ size_type row_pitch,
+ size_type slice_pitch,
+ void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadImage(
+ object_,
+ image(),
+ blocking,
+ origin.data(),
+ region.data(),
+ row_pitch,
+ slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteImage(
+ const Image& image,
+ cl_bool blocking,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ size_type row_pitch,
+ size_type slice_pitch,
+ void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteImage(
+ object_,
+ image(),
+ blocking,
+ origin.data(),
+ region.data(),
+ row_pitch,
+ slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyImage(
+ const Image& src,
+ const Image& dst,
+ const array<size_type, 3>& src_origin,
+ const array<size_type, 3>& dst_origin,
+ const array<size_type, 3>& region,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyImage(
+ object_,
+ src(),
+ dst(),
+ src_origin.data(),
+ dst_origin.data(),
+ region.data(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four component RGBA floating-point color value if
+ * the image channel data type is not an unnormalized signed or
+ * unsigned data type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_float4 fillColor,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ origin.data(),
+ region.data(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four component RGBA signed integer color value if
+ * the image channel data type is an unnormalized signed integer
+ * type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_int4 fillColor,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ origin.data(),
+ region.data(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueue a command to fill an image object with a specified color.
+ * \param fillColor is the color to use to fill the image.
+ * This is a four component RGBA unsigned integer color value if
+ * the image channel data type is an unnormalized unsigned integer
+ * type.
+ */
+ cl_int enqueueFillImage(
+ const Image& image,
+ cl_uint4 fillColor,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueFillImage(
+ object_,
+ image(),
+ static_cast<void*>(&fillColor),
+ origin.data(),
+ region.data(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_FILL_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+ cl_int enqueueCopyImageToBuffer(
+ const Image& src,
+ const Buffer& dst,
+ const array<size_type, 3>& src_origin,
+ const array<size_type, 3>& region,
+ size_type dst_offset,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyImageToBuffer(
+ object_,
+ src(),
+ dst(),
+ src_origin.data(),
+ region.data(),
+ dst_offset,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBufferToImage(
+ const Buffer& src,
+ const Image& dst,
+ size_type src_offset,
+ const array<size_type, 3>& dst_origin,
+ const array<size_type, 3>& region,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBufferToImage(
+ object_,
+ src(),
+ dst(),
+ src_offset,
+ dst_origin.data(),
+ region.data(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ void* enqueueMapBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ size_type offset,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL) const
+ {
+ cl_event tmp;
+ cl_int error;
+ void * result = ::clEnqueueMapBuffer(
+ object_, buffer(), blocking, flags, offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ if (event != NULL && error == CL_SUCCESS)
+ *event = tmp;
+
+ return result;
+ }
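+
+    /*
+     * Illustrative sketch (not part of the API): the usual map/modify/unmap
+     * pattern with a blocking map. "queue", "buffer" and "bytes" are
+     * assumptions for the example.
+     *
+     *     cl_int err = CL_SUCCESS;
+     *     void *p = queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE,
+     *                                      0, bytes, NULL, NULL, &err);
+     *     if (err == CL_SUCCESS) {
+     *         // ... write through p ...
+     *         queue.enqueueUnmapMemObject(buffer, p);
+     *         queue.finish();
+     *     }
+     */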
+
+ void* enqueueMapImage(
+        const Image& image,
+ cl_bool blocking,
+ cl_map_flags flags,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ size_type * row_pitch,
+ size_type * slice_pitch,
+ const vector<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL) const
+ {
+ cl_event tmp;
+ cl_int error;
+ void * result = ::clEnqueueMapImage(
+            object_, image(), blocking, flags,
+ origin.data(),
+ region.data(),
+ row_pitch, slice_pitch,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ if (event != NULL && error == CL_SUCCESS)
+ *event = tmp;
+ return result;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ /**
+ * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
+ * This variant takes a raw SVM pointer.
+ */
+ template<typename T>
+ cl_int enqueueMapSVM(
+ T* ptr,
+ cl_bool blocking,
+ cl_map_flags flags,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(::clEnqueueSVMMap(
+ object_, blocking, flags, static_cast<void*>(ptr), size,
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MAP_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+
+ /**
+ * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
+ * This variant takes a cl::pointer instance.
+ */
+ template<typename T, class D>
+ cl_int enqueueMapSVM(
+ cl::pointer<T, D> &ptr,
+ cl_bool blocking,
+ cl_map_flags flags,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(::clEnqueueSVMMap(
+ object_, blocking, flags, static_cast<void*>(ptr.get()), size,
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MAP_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
+ * This variant takes a cl::vector instance.
+ */
+ template<typename T, class Alloc>
+ cl_int enqueueMapSVM(
+ cl::vector<T, Alloc> &container,
+ cl_bool blocking,
+ cl_map_flags flags,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(::clEnqueueSVMMap(
+ object_, blocking, flags, static_cast<void*>(container.data()), container.size(),
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MAP_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
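+
+    /*
+     * Illustrative sketch (not part of the API): mapping a coarse-grained SVM
+     * vector for host writes, then unmapping it. The allocator alias and
+     * "queue" are assumptions for the example.
+     *
+     *     using SvmAlloc = cl::SVMAllocator<int, cl::SVMTraitCoarse<>>;
+     *     cl::vector<int, SvmAlloc> data(1024, 0, SvmAlloc());
+     *     queue.enqueueMapSVM(data, CL_TRUE, CL_MAP_WRITE);
+     *     // ... host writes to data ...
+     *     queue.enqueueUnmapSVM(data);
+     */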
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+ cl_int enqueueUnmapMemObject(
+ const Memory& memory,
+ void* mapped_ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueUnmapMemObject(
+ object_, memory(), mapped_ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ /**
+ * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
+ * This variant takes a raw SVM pointer.
+ */
+ template<typename T>
+ cl_int enqueueUnmapSVM(
+ T* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueSVMUnmap(
+ object_, static_cast<void*>(ptr),
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
+ * This variant takes a cl::pointer instance.
+ */
+ template<typename T, class D>
+ cl_int enqueueUnmapSVM(
+ cl::pointer<T, D> &ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueSVMUnmap(
+ object_, static_cast<void*>(ptr.get()),
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
+ * This variant takes a cl::vector instance.
+ */
+ template<typename T, class Alloc>
+ cl_int enqueueUnmapSVM(
+ cl::vector<T, Alloc> &container,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueSVMUnmap(
+ object_, static_cast<void*>(container.data()),
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ /**
+ * Enqueues a marker command which waits for either a list of events to complete,
+ * or all previously enqueued commands to complete.
+ *
+ * Enqueues a marker command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+     * to complete before it completes. This command returns an event which can be
+     * waited on to ensure that all events in the event_wait_list, or all commands
+     * previously enqueued to command_queue before this command, have completed.
+ */
+ cl_int enqueueMarkerWithWaitList(
+ const vector<Event> *events = 0,
+ Event *event = 0)
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueMarkerWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MARKER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * A synchronization point that enqueues a barrier operation.
+ *
+ * Enqueues a barrier command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+     * to complete before it completes. This command blocks command execution: any
+     * commands enqueued after it do not execute until it completes. This command
+     * returns an event which can be waited on to ensure that all events in the
+     * event_wait_list, or all commands previously enqueued to command_queue before
+     * this command, have completed.
+ */
+ cl_int enqueueBarrierWithWaitList(
+ const vector<Event> *events = 0,
+ Event *event = 0)
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueBarrierWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_BARRIER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
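+
+    /*
+     * Illustrative sketch (not part of the API): a barrier whose completion
+     * event is waited on by the host. "queue" is an assumption.
+     *
+     *     cl::Event done;
+     *     queue.enqueueBarrierWithWaitList(NULL, &done);
+     *     done.wait();
+     */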
+
+ /**
+ * Enqueues a command to indicate with which device a set of memory objects
+ * should be associated.
+ */
+ cl_int enqueueMigrateMemObjects(
+ const vector<Memory> &memObjects,
+ cl_mem_migration_flags flags,
+ const vector<Event>* events = NULL,
+ Event* event = NULL
+ )
+ {
+ cl_event tmp;
+
+ vector<cl_mem> localMemObjects(memObjects.size());
+
+        for (size_type i = 0; i < memObjects.size(); ++i) {
+ localMemObjects[i] = memObjects[i]();
+ }
+
+
+ cl_int err = detail::errHandler(
+ ::clEnqueueMigrateMemObjects(
+ object_,
+ (cl_uint)memObjects.size(),
+ localMemObjects.data(),
+ flags,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+ cl_int enqueueNDRangeKernel(
+ const Kernel& kernel,
+ const NDRange& offset,
+ const NDRange& global,
+ const NDRange& local = NullRange,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueNDRangeKernel(
+ object_, kernel(), (cl_uint) global.dimensions(),
+ offset.dimensions() != 0 ? (const size_type*) offset : NULL,
+ (const size_type*) global,
+ local.dimensions() != 0 ? (const size_type*) local : NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
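+
+    /*
+     * Illustrative sketch (not part of the API): launching a kernel over a
+     * 1-D global range of 1024 work-items in work-groups of 64. "program",
+     * "srcBuffer", "dstBuffer" and the "vector_add" kernel name are
+     * hypothetical.
+     *
+     *     cl::Kernel kernel(program, "vector_add");
+     *     kernel.setArg(0, srcBuffer);
+     *     kernel.setArg(1, dstBuffer);
+     *     queue.enqueueNDRangeKernel(kernel, cl::NullRange,
+     *                                cl::NDRange(1024), cl::NDRange(64));
+     */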
+
+#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+ CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask(
+ const Kernel& kernel,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueTask(
+ object_, kernel(),
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_TASK_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+
+ cl_int enqueueNativeKernel(
+ void (CL_CALLBACK *userFptr)(void *),
+ std::pair<void*, size_type> args,
+ const vector<Memory>* mem_objects = NULL,
+ const vector<const void*>* mem_locs = NULL,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ size_type elements = 0;
+ if (mem_objects != NULL) {
+ elements = mem_objects->size();
+ }
+ vector<cl_mem> mems(elements);
+ for (unsigned int i = 0; i < elements; i++) {
+ mems[i] = ((*mem_objects)[i])();
+ }
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueNativeKernel(
+ object_, userFptr, args.first, args.second,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ mems.data(),
+ (mem_locs != NULL && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_NATIVE_KERNEL);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+/**
+ * OpenCL 1.1 APIs that were deprecated in OpenCL 1.2.
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueMarker(
+ object_,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MARKER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueWaitForEvents(const vector<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueWaitForEvents(
+ object_,
+ (cl_uint) events.size(),
+ events.size() > 0 ? (const cl_event*) &events.front() : NULL),
+ __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+ }
+#endif // defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+ cl_int enqueueAcquireGLObjects(
+ const vector<Memory>* mem_objects = NULL,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueAcquireGLObjects(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_ACQUIRE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReleaseGLObjects(
+ const vector<Memory>* mem_objects = NULL,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReleaseGLObjects(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_RELEASE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+#if defined (CL_HPP_USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+ cl_command_queue command_queue, cl_uint num_objects,
+ const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+ cl_command_queue command_queue, cl_uint num_objects,
+ const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+ const cl_event* event_wait_list, cl_event* event);
+
+ cl_int enqueueAcquireD3D10Objects(
+ const vector<Memory>* mem_objects = NULL,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+ cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+ cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+ CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+ CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ pfn_clEnqueueAcquireD3D10ObjectsKHR(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_ACQUIRE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReleaseD3D10Objects(
+ const vector<Memory>* mem_objects = NULL,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+ cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+ cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+ cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+ CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+ CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ pfn_clEnqueueReleaseD3D10ObjectsKHR(
+ object_,
+ (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+ (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_RELEASE_GL_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif
+
+/**
+ * OpenCL 1.1 APIs that were deprecated in OpenCL 1.2.
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+ CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ {
+ return detail::errHandler(
+ ::clEnqueueBarrier(object_),
+ __ENQUEUE_BARRIER_ERR);
+ }
+#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS
+
+ cl_int flush() const
+ {
+ return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+ }
+
+ cl_int finish() const
+ {
+ return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+ }
+}; // CommandQueue
+
+CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::default_initialized_;
+CL_HPP_DEFINE_STATIC_MEMBER_ CommandQueue CommandQueue::default_;
+CL_HPP_DEFINE_STATIC_MEMBER_ cl_int CommandQueue::default_error_ = CL_SUCCESS;
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+enum class DeviceQueueProperties : cl_command_queue_properties
+{
+ None = 0,
+ Profiling = CL_QUEUE_PROFILING_ENABLE,
+};
+
+inline DeviceQueueProperties operator|(DeviceQueueProperties lhs, DeviceQueueProperties rhs)
+{
+ return static_cast<DeviceQueueProperties>(static_cast<cl_command_queue_properties>(lhs) | static_cast<cl_command_queue_properties>(rhs));
+}
+
+/*! \class DeviceCommandQueue
+ * \brief DeviceCommandQueue interface for device cl_command_queues.
+ */
+class DeviceCommandQueue : public detail::Wrapper<cl_command_queue>
+{
+public:
+
+ /*!
+ * Trivial empty constructor to create a null queue.
+ */
+ DeviceCommandQueue() { }
+
+ /*!
+     * Default-construct a device command queue on the default context and device.
+ */
+ DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = NULL)
+ {
+ cl_int error;
+ cl::Context context = cl::Context::getDefault();
+ cl::Device device = cl::Device::getDefault();
+
+ cl_command_queue_properties mergedProperties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
+
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, mergedProperties, 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*!
+ * Create a device command queue for a specified device in the passed context.
+ */
+ DeviceCommandQueue(
+ const Context& context,
+ const Device& device,
+ DeviceQueueProperties properties = DeviceQueueProperties::None,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ cl_command_queue_properties mergedProperties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, mergedProperties, 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*!
+ * Create a device command queue for a specified device in the passed context.
+ */
+ DeviceCommandQueue(
+ const Context& context,
+ const Device& device,
+ cl_uint queueSize,
+ DeviceQueueProperties properties = DeviceQueueProperties::None,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ cl_command_queue_properties mergedProperties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, mergedProperties,
+ CL_QUEUE_SIZE, queueSize,
+ 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructor from cl_command_queue - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ */
+ explicit DeviceCommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) :
+ detail::Wrapper<cl_type>(commandQueue, retainObject) { }
+
+ DeviceCommandQueue& operator = (const cl_command_queue& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue(const DeviceCommandQueue& queue) : detail::Wrapper<cl_type>(queue) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue& operator = (const DeviceCommandQueue &queue)
+ {
+ detail::Wrapper<cl_type>::operator=(queue);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue(DeviceCommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(queue)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue& operator = (DeviceCommandQueue &&queue)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(queue));
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_command_queue_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetCommandQueueInfo, object_, name, param),
+ __GET_COMMAND_QUEUE_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_command_queue_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_command_queue_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*!
+ * Create a new default device command queue for the default device,
+ * in the default context and of the default size.
+ * If there is already a default queue for the specified device this
+ * function will return the pre-existing queue.
+ */
+ static DeviceCommandQueue makeDefault(
+ cl_int *err = nullptr)
+ {
+ cl_int error;
+ cl::Context context = cl::Context::getDefault();
+ cl::Device device = cl::Device::getDefault();
+
+ cl_command_queue_properties properties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT;
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties,
+ 0 };
+ DeviceCommandQueue deviceQueue(
+ ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error));
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ return deviceQueue;
+ }
+
+ /*!
+ * Create a new default device command queue for the specified device
+ * and of the default size.
+ * If there is already a default queue for the specified device this
+ * function will return the pre-existing queue.
+ */
+ static DeviceCommandQueue makeDefault(
+ const Context &context, const Device &device, cl_int *err = nullptr)
+ {
+ cl_int error;
+
+ cl_command_queue_properties properties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT;
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties,
+ 0 };
+ DeviceCommandQueue deviceQueue(
+ ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error));
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ return deviceQueue;
+ }
+
+ /*!
+ * Create a new default device command queue for the specified device
+ * and of the requested size in bytes.
+ * If there is already a default queue for the specified device this
+ * function will return the pre-existing queue.
+ */
+ static DeviceCommandQueue makeDefault(
+ const Context &context, const Device &device, cl_uint queueSize, cl_int *err = nullptr)
+ {
+ cl_int error;
+
+ cl_command_queue_properties properties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT;
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties,
+ CL_QUEUE_SIZE, queueSize,
+ 0 };
+ DeviceCommandQueue deviceQueue(
+ ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error));
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ return deviceQueue;
+ }
+}; // DeviceCommandQueue
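+
+/*
+ * Illustrative sketch (not part of the API): creating the default on-device
+ * queue for a context/device pair. "context" and "device" are assumptions.
+ *
+ *     cl_int err = CL_SUCCESS;
+ *     cl::DeviceCommandQueue devQueue =
+ *         cl::DeviceCommandQueue::makeDefault(context, device, &err);
+ */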
+
+namespace detail
+{
+ // Specialization for device command queue
+ template <>
+ struct KernelArgumentHandler<cl::DeviceCommandQueue, void>
+ {
+ static size_type size(const cl::DeviceCommandQueue&) { return sizeof(cl_command_queue); }
+ static const cl_command_queue* ptr(const cl::DeviceCommandQueue& value) { return &(value()); }
+ };
+} // namespace detail
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+
+template< typename IteratorType >
+Buffer::Buffer(
+ const Context &context,
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr,
+ cl_int* err)
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if( readOnly ) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if( useHostPtr ) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ size_type size = sizeof(DataType)*(endIterator - startIterator);
+
+ if( useHostPtr ) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ } else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if( !useHostPtr ) {
+ CommandQueue queue(context, 0, &error);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ error = cl::copy(queue, startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+}
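+
+/*
+ * Illustrative sketch (not part of the API): building a device buffer from a
+ * host container via the iterator constructor above. "context" is an
+ * assumption; with useHostPtr = false the data is copied via a temporary
+ * command queue created on the context.
+ *
+ *     std::vector<float> host(1024, 0.0f);
+ *     cl_int err = CL_SUCCESS;
+ *     cl::Buffer buf(context, host.begin(), host.end(),
+ *                    false, false, &err);  // readOnly = false, useHostPtr = false
+ */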
+
+template< typename IteratorType >
+Buffer::Buffer(
+ const CommandQueue &queue,
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr,
+ cl_int* err)
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if (readOnly) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if (useHostPtr) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ size_type size = sizeof(DataType)*(endIterator - startIterator);
+
+ Context context = queue.getInfo<CL_QUEUE_CONTEXT>();
+
+ if (useHostPtr) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ }
+ else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if (!useHostPtr) {
+ error = cl::copy(queue, startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+}
+
+inline cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ size_type offset,
+ size_type size,
+ void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ size_type offset,
+ size_type size,
+ const void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ size_type offset,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ void * result = ::clEnqueueMapBuffer(
+ queue(), buffer(), blocking, flags, offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+}
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+/**
+ * Enqueues to the default queue a command that will allow the host to
+ * update a region of a coarse-grained SVM buffer.
+ * This variant takes a raw SVM pointer.
+ */
+template<typename T>
+inline cl_int enqueueMapSVM(
+ T* ptr,
+ cl_bool blocking,
+ cl_map_flags flags,
+ size_type size,
+ const vector<Event>* events,
+ Event* event)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS) {
+ return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ }
+
+ return queue.enqueueMapSVM(
+ ptr, blocking, flags, size, events, event);
+}
+
+/**
+ * Enqueues to the default queue a command that will allow the host to
+ * update a region of a coarse-grained SVM buffer.
+ * This variant takes a cl::pointer instance.
+ */
+template<typename T, class D>
+inline cl_int enqueueMapSVM(
+    cl::pointer<T, D> &ptr,
+ cl_bool blocking,
+ cl_map_flags flags,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS) {
+ return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ }
+
+ return queue.enqueueMapSVM(
+ ptr, blocking, flags, size, events, event);
+}
+
+/**
+ * Enqueues to the default queue a command that will allow the host to
+ * update a region of a coarse-grained SVM buffer.
+ * This variant takes a cl::vector instance.
+ */
+template<typename T, class Alloc>
+inline cl_int enqueueMapSVM(
+    cl::vector<T, Alloc> &container,
+ cl_bool blocking,
+ cl_map_flags flags,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS) {
+ return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ }
+
+ return queue.enqueueMapSVM(
+ container, blocking, flags, events, event);
+}
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+inline cl_int enqueueUnmapMemObject(
+ const Memory& memory,
+ void* mapped_ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueUnmapMemObject(
+ queue(), memory(), mapped_ptr,
+ (events != NULL) ? (cl_uint)events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+}
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+/**
+ * Enqueues to the default queue a command that will release a coarse-grained
+ * SVM buffer back to the OpenCL runtime.
+ * This variant takes a raw SVM pointer.
+ */
+template<typename T>
+inline cl_int enqueueUnmapSVM(
+ T* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS) {
+ return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+ }
+
+ return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+}
+
+/**
+ * Enqueues to the default queue a command that will release a coarse-grained
+ * SVM buffer back to the OpenCL runtime.
+ * This variant takes a cl::pointer instance.
+ */
+template<typename T, class D>
+inline cl_int enqueueUnmapSVM(
+ cl::pointer<T, D> &ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS) {
+ return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+ }
+
+ return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+}
+
+/**
+ * Enqueues to the default queue a command that will release a coarse-grained
+ * SVM buffer back to the OpenCL runtime.
+ * This variant takes a cl::vector instance.
+ */
+template<typename T, class Alloc>
+inline cl_int enqueueUnmapSVM(
+ cl::vector<T, Alloc> &container,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS) {
+ return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+ }
+
+ return detail::errHandler(queue.enqueueUnmapSVM(container, events, event),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+}
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+inline cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ size_type src_offset,
+ size_type dst_offset,
+ size_type size,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, startIterator, endIterator, buffer);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, buffer, startIterator, endIterator);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ size_type length = endIterator-startIterator;
+ size_type byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+#if defined(_MSC_VER)
+ std::copy(
+ startIterator,
+ endIterator,
+ stdext::checked_array_iterator<DataType*>(
+ pointer, length));
+#else
+ std::copy(startIterator, endIterator, pointer);
+#endif
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ size_type length = endIterator-startIterator;
+ size_type byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ std::copy(pointer, pointer + length, startIterator);
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
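+
+/*
+ * Illustrative sketch (not part of the API): blocking copies in both
+ * directions through an explicit queue. "queue" and "buffer" are assumptions;
+ * the buffer must be at least data.size() * sizeof(int) bytes.
+ *
+ *     std::vector<int> data(256, 42);
+ *     cl::copy(queue, data.begin(), data.end(), buffer); // host -> device
+ *     cl::copy(queue, buffer, data.begin(), data.end()); // device -> host
+ */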
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+/**
+ * Blocking SVM map operation - performs a blocking map underneath.
+ */
+template<typename T, class Alloc>
+inline cl_int mapSVM(cl::vector<T, Alloc> &container)
+{
+ return enqueueMapSVM(container, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE);
+}
+
+/**
+ * Blocking SVM unmap operation - performs a blocking unmap underneath.
+ */
+template<typename T, class Alloc>
+inline cl_int unmapSVM(cl::vector<T, Alloc> &container)
+{
+ return enqueueUnmapSVM(container);
+}
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+inline cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const array<size_type, 3>& buffer_offset,
+ const array<size_type, 3>& host_offset,
+ const array<size_type, 3>& region,
+ size_type buffer_row_pitch,
+ size_type buffer_slice_pitch,
+ size_type host_row_pitch,
+ size_type host_slice_pitch,
+ void *ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const array<size_type, 3>& buffer_offset,
+ const array<size_type, 3>& host_offset,
+ const array<size_type, 3>& region,
+ size_type buffer_row_pitch,
+ size_type buffer_slice_pitch,
+ size_type host_row_pitch,
+ size_type host_slice_pitch,
+ void *ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+ const Buffer& src,
+ const Buffer& dst,
+ const array<size_type, 3>& src_origin,
+ const array<size_type, 3>& dst_origin,
+ const array<size_type, 3>& region,
+ size_type src_row_pitch,
+ size_type src_slice_pitch,
+ size_type dst_row_pitch,
+ size_type dst_slice_pitch,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBufferRect(
+ src,
+ dst,
+ src_origin,
+ dst_origin,
+ region,
+ src_row_pitch,
+ src_slice_pitch,
+ dst_row_pitch,
+ dst_slice_pitch,
+ events,
+ event);
+}
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+
+inline cl_int enqueueReadImage(
+ const Image& image,
+ cl_bool blocking,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ size_type row_pitch,
+ size_type slice_pitch,
+ void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadImage(
+ image,
+ blocking,
+ origin,
+ region,
+ row_pitch,
+ slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteImage(
+ const Image& image,
+ cl_bool blocking,
+ const array<size_type, 3>& origin,
+ const array<size_type, 3>& region,
+ size_type row_pitch,
+ size_type slice_pitch,
+ void* ptr,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteImage(
+ image,
+ blocking,
+ origin,
+ region,
+ row_pitch,
+ slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyImage(
+ const Image& src,
+ const Image& dst,
+ const array<size_type, 3>& src_origin,
+ const array<size_type, 3>& dst_origin,
+ const array<size_type, 3>& region,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyImage(
+ src,
+ dst,
+ src_origin,
+ dst_origin,
+ region,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+ const Image& src,
+ const Buffer& dst,
+ const array<size_type, 3>& src_origin,
+ const array<size_type, 3>& region,
+ size_type dst_offset,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyImageToBuffer(
+ src,
+ dst,
+ src_origin,
+ region,
+ dst_offset,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+ const Buffer& src,
+ const Image& dst,
+ size_type src_offset,
+ const array<size_type, 3>& dst_origin,
+ const array<size_type, 3>& region,
+ const vector<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBufferToImage(
+ src,
+ dst,
+ src_offset,
+ dst_origin,
+ region,
+ events,
+ event);
+}
+
+
+inline cl_int flush(void)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.flush();
+}
+
+inline cl_int finish(void)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.finish();
+}
+
+class EnqueueArgs
+{
+private:
+ CommandQueue queue_;
+ const NDRange offset_;
+ const NDRange global_;
+ const NDRange local_;
+ vector<Event> events_;
+
+ template<typename... Ts>
+ friend class KernelFunctor;
+
+public:
+ EnqueueArgs(NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+
+ }
+
+ EnqueueArgs(NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(Event e, NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(Event e, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(const vector<Event> &events, NDRange global) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(const vector<Event> &events, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(NullRange),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(const vector<Event> &events, NDRange offset, NDRange global, NDRange local) :
+ queue_(CommandQueue::getDefault()),
+ offset_(offset),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local)
+ {
+ events_.push_back(e);
+ }
+
+ EnqueueArgs(CommandQueue &queue, const vector<Event> &events, NDRange global) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(NullRange),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, const vector<Event> &events, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(NullRange),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+
+ EnqueueArgs(CommandQueue &queue, const vector<Event> &events, NDRange offset, NDRange global, NDRange local) :
+ queue_(queue),
+ offset_(offset),
+ global_(global),
+ local_(local),
+ events_(events)
+ {
+
+ }
+};
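EnqueueArgs simply bundles a queue, an optional offset, the global/local ranges, and an optional wait list so that KernelFunctor (declared below, hence the friend declaration) can unpack them in one place. A hedged sketch of three ways to describe the same 1-D launch (`myQueue` and `dep` are hypothetical names):

    cl::EnqueueArgs a1(cl::NDRange(1024));                                // default queue, runtime-chosen local size
    cl::EnqueueArgs a2(cl::NDRange(1024), cl::NDRange(64));               // default queue, explicit local size
    cl::EnqueueArgs a3(myQueue, dep, cl::NDRange(1024), cl::NDRange(64)); // explicit queue plus one dependency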
+
+
+//----------------------------------------------------------------------------------------------
+
+
+/**
+ * Type-safe kernel functor.
+ *
+ */
+template<typename... Ts>
+class KernelFunctor
+{
+private:
+ Kernel kernel_;
+
+ template<int index, typename T0, typename... T1s>
+ void setArgs(T0&& t0, T1s&&... t1s)
+ {
+ kernel_.setArg(index, t0);
+ setArgs<index + 1, T1s...>(std::forward<T1s>(t1s)...);
+ }
+
+ template<int index, typename T0>
+ void setArgs(T0&& t0)
+ {
+ kernel_.setArg(index, t0);
+ }
+
+ template<int index>
+ void setArgs()
+ {
+ }
+
+
+public:
+ KernelFunctor(Kernel kernel) : kernel_(kernel)
+ {}
+
+ KernelFunctor(
+ const Program& program,
+ const string name,
+ cl_int * err = NULL) :
+ kernel_(program, name.c_str(), err)
+ {}
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ /**
+ * Enqueue kernel.
+ * @param args Launch parameters of the kernel.
+ * @param t0... List of kernel arguments based on the template type of the functor.
+ */
+ Event operator() (
+ const EnqueueArgs& args,
+ Ts... ts)
+ {
+ Event event;
+ setArgs<0>(std::forward<Ts>(ts)...);
+
+ args.queue_.enqueueNDRangeKernel(
+ kernel_,
+ args.offset_,
+ args.global_,
+ args.local_,
+ &args.events_,
+ &event);
+
+ return event;
+ }
+
+ /**
+ * Enqueue kernel with support for error code.
+ * @param args Launch parameters of the kernel.
+ * @param t0... List of kernel arguments based on the template type of the functor.
+ * @param error Out parameter returning the error code from the execution.
+ */
+ Event operator() (
+ const EnqueueArgs& args,
+ Ts... ts,
+ cl_int &error)
+ {
+ Event event;
+ setArgs<0>(std::forward<Ts>(ts)...);
+
+ error = args.queue_.enqueueNDRangeKernel(
+ kernel_,
+ args.offset_,
+ args.global_,
+ args.local_,
+ &args.events_,
+ &event);
+
+ return event;
+ }
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+ cl_int setSVMPointers(const vector<void*> &pointerList)
+ {
+ return kernel_.setSVMPointers(pointerList);
+ }
+
+ template<typename T0, typename... T1s>
+ cl_int setSVMPointers(const T0 &t0, T1s... ts)
+ {
+ return kernel_.setSVMPointers(t0, ts...);
+ }
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+ Kernel getKernel()
+ {
+ return kernel_;
+ }
+};
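The template parameter pack fixes the kernel's argument types at compile time, so a mismatched argument list fails to compile instead of failing at dispatch. A usage sketch, assuming a program containing a hypothetical kernel `vadd(global float*, global float*, global float*, int)`:

    cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, int> vadd(program, "vadd");
    cl::Event done = vadd(cl::EnqueueArgs(cl::NDRange(n)), a, b, c, (int)n);
    done.wait();

Note that the first operator() discards the enqueue status; pass a cl_int by reference as the trailing argument to use the error-reporting overload instead.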
+
+namespace compatibility {
+ /**
+ * Backward compatibility class to ensure that cl.hpp code works with cl2.hpp.
+ * Please use KernelFunctor directly.
+ */
+ template<typename... Ts>
+ struct make_kernel
+ {
+ typedef KernelFunctor<Ts...> FunctorType;
+
+ FunctorType functor_;
+
+ make_kernel(
+ const Program& program,
+ const string name,
+ cl_int * err = NULL) :
+ functor_(FunctorType(program, name, err))
+ {}
+
+ make_kernel(
+ const Kernel kernel) :
+ functor_(FunctorType(kernel))
+ {}
+
+ //! \brief Return type of the functor
+ typedef Event result_type;
+
+ //! \brief Function signature of kernel functor with no event dependency.
+ typedef Event type_(
+ const EnqueueArgs&,
+ Ts...);
+
+ Event operator()(
+ const EnqueueArgs& enqueueArgs,
+ Ts... args)
+ {
+ return functor_(
+ enqueueArgs, args...);
+ }
+ };
+} // namespace compatibility
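In practice the shim means cl.hpp-era call sites keep compiling unchanged; only the type name differs (sketch, kernel name hypothetical):

    cl::compatibility::make_kernel<cl::Buffer, int> old_style(program, "scale"); // legacy spelling
    cl::KernelFunctor<cl::Buffer, int>              new_style(program, "scale"); // preferred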
+
+
+//----------------------------------------------------------------------------------------------------------------------
+
+#undef CL_HPP_ERR_STR_
+#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS)
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_ARG_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+
+#undef __CREATE_CONTEXT_ERR
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+
+#undef __CREATE_BUFFER_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+#undef __SET_PRINTF_CALLBACK_ERR
+
+#undef __WAIT_FOR_EVENTS_ERR
+
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+
+#undef __UNLOAD_COMPILER_ERR
+#undef __CREATE_SUB_DEVICES_ERR
+
+#undef __CREATE_PIPE_ERR
+#undef __GET_PIPE_INFO_ERR
+
+#endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS
+
+// Extensions
+#undef CL_HPP_INIT_CL_EXT_FCN_PTR_
+#undef CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_
+
+#if defined(CL_HPP_USE_CL_DEVICE_FISSION)
+#undef CL_HPP_PARAM_NAME_DEVICE_FISSION_
+#endif // CL_HPP_USE_CL_DEVICE_FISSION
+
+#undef CL_HPP_NOEXCEPT_
+#undef CL_HPP_DEFINE_STATIC_MEMBER_
+
+} // namespace cl
+
+#endif // CL_HPP_
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
new file mode 100644
index 0000000000..81b0d37214
--- /dev/null
+++ b/include/CL/cl_d3d10.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d10_sharing */
+#define cl_khr_d3d10_sharing 1
+
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D10_DEVICE_KHR -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
+
+// cl_d3d10_device_source_khr
+#define CL_D3D10_DEVICE_KHR 0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
+
+// cl_d3d10_device_set_khr
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
+
+// cl_context_info
+#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+
+// cl_mem_info
+#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
+
+// cl_image_info
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+ cl_platform_id platform,
+ cl_d3d10_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d10_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Buffer * resource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D10Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __OPENCL_CL_D3D10_H
+
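This header (like the D3D11 and DX9 media-sharing headers that follow) only supplies enums and function-pointer typedefs; the entry points themselves must be resolved at run time through the ICD. A hedged sketch of the usual pattern (`platform` and `d3d_device` assumed valid):

    clGetDeviceIDsFromD3D10KHR_fn pfn = (clGetDeviceIDsFromD3D10KHR_fn)
        clGetExtensionFunctionAddressForPlatform(platform, "clGetDeviceIDsFromD3D10KHR");
    if (pfn != NULL) {
        cl_uint n = 0;
        pfn(platform, CL_D3D10_DEVICE_KHR, d3d_device,
            CL_PREFERRED_DEVICES_FOR_D3D10_KHR, 0, NULL, &n);
    }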
diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h
new file mode 100644
index 0000000000..d3c8bdc2b1
--- /dev/null
+++ b/include/CL/cl_d3d11.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+
+#include <d3d11.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d11_sharing */
+#define cl_khr_d3d11_sharing 1
+
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D11_DEVICE_KHR -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
+
+// cl_d3d11_device_source
+#define CL_D3D11_DEVICE_KHR 0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
+
+// cl_d3d11_device_set
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
+
+// cl_context_info
+#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
+
+// cl_mem_info
+#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
+
+// cl_image_info
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+ cl_platform_id platform,
+ cl_d3d11_device_source_khr d3d_device_source,
+ void * d3d_object,
+ cl_d3d11_device_set_khr d3d_device_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Buffer * resource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture2D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ ID3D11Texture3D * resource,
+ UINT subresource,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __OPENCL_CL_D3D11_H
+
diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h
new file mode 100644
index 0000000000..1ef543a5af
--- /dev/null
+++ b/include/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,127 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_dx9_media_sharing */
+#define cl_khr_dx9_media_sharing 1
+
+typedef cl_uint cl_dx9_media_adapter_type_khr;
+typedef cl_uint cl_dx9_media_adapter_set_khr;
+
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+ IDirect3DSurface9 *resource;
+ HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
+
+// cl_media_adapter_type_khr
+#define CL_ADAPTER_D3D9_KHR 0x2020
+#define CL_ADAPTER_D3D9EX_KHR 0x2021
+#define CL_ADAPTER_DXVA_KHR 0x2022
+
+// cl_media_adapter_set_khr
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
+
+// cl_context_info
+#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
+
+// cl_mem_info
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
+
+// cl_image_info
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+ cl_platform_id platform,
+ cl_uint num_media_adapters,
+ cl_dx9_media_adapter_type_khr * media_adapter_type,
+ void * media_adapters,
+ cl_dx9_media_adapter_set_khr media_adapter_set,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+ cl_context context,
+ cl_mem_flags flags,
+ cl_dx9_media_adapter_type_khr adapter_type,
+ void * surface_info,
+ cl_uint plane,
+ cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __OPENCL_CL_DX9_MEDIA_SHARING_H
+
diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h
new file mode 100644
index 0000000000..c1bd4f3942
--- /dev/null
+++ b/include/CL/cl_egl.h
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__
+
+#else
+#include <CL/cl.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context /* context */,
+ CLeglDisplayKHR /* egldisplay */,
+ CLeglImageKHR /* eglimage */,
+ cl_mem_flags /* flags */,
+ const cl_egl_image_properties_khr * /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+ cl_context context,
+ CLeglDisplayKHR egldisplay,
+ CLeglImageKHR eglimage,
+ cl_mem_flags flags,
+ const cl_egl_image_properties_khr * properties,
+ cl_int * errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+ cl_command_queue command_queue,
+ cl_uint num_objects,
+ const cl_mem * mem_objects,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context /* context */,
+ EGLSyncKHR /* sync */,
+ EGLDisplay /* display */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+ cl_context context,
+ EGLSyncKHR sync,
+ EGLDisplay display,
+ cl_int * errcode_ret);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
new file mode 100644
index 0000000000..710bea8837
--- /dev/null
+++ b/include/CL/cl_ext.h
@@ -0,0 +1,316 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies. */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl.h>
+ #include <AvailabilityMacros.h>
+#else
+ #include <CL/cl.h>
+#endif
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions */
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for managing externally allocated buffers used with cl_mem objects created with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
+ void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+ void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
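The callback fires once the last reference to the cl_mem is released, at which point the CL_MEM_USE_HOST_PTR backing store can be reclaimed. A hedged sketch (`buf` and `host_ptr` are hypothetical; the extension check described above is omitted for brevity):

    static void on_buffer_destroyed(cl_mem memobj, void *user_data)
    {
        free(user_data); /* the host_ptr backing store is now safe to free */
    }
    /* ... */
    clSetMemObjectDestructorAPPLE(buf, on_buffer_destroyed, host_ptr);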
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
+ const void * /* private_info */,
+ size_t /* cb */,
+ void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
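Since all three share the clCreateContext pfn_notify signature, wiring one up is a single argument (sketch; `props` and `dev` are hypothetical):

    cl_int err;
    cl_context ctx = clCreateContext(props, 1, &dev,
                                     clLogMessagesToStdoutAPPLE, /* pfn_notify */
                                     NULL,                       /* user_data */
                                     &err);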
+
+
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info */
+#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
+
+/* Additional Error Codes */
+#define CL_PLATFORM_NOT_FOUND_KHR -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+ cl_uint /* num_entries */,
+ cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */);
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch.
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
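Where supported, this is exposed through the OpenCL 1.2 clCreateImage entry point: point the image descriptor's buffer field at the existing buffer and no copy is made. A hedged sketch (`ctx`, `existing_buffer`, and the sizes are hypothetical; pitch/alignment rules as stated above):

    cl_image_format fmt  = { CL_RGBA, CL_UNORM_INT8 };
    cl_image_desc   desc = { 0 };
    desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width     = width;
    desc.image_height    = height;
    desc.image_row_pitch = row_pitch;       /* multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels */
    desc.buffer          = existing_buffer; /* zero-copy view over the buffer */
    cl_int err;
    cl_mem img = clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);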
+
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x200E
+
+
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F
+#define CL_CONTEXT_TERMINATE_KHR 0x2010
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS 0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
+#define CL_DEVICE_WARP_SIZE_NV 0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM 0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
+
+#ifdef CL_VERSION_1_1
+ /***********************************
+ * cl_ext_device_fission extension *
+ ***********************************/
+ #define cl_ext_device_fission 1
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef cl_ulong cl_device_partition_property_ext;
+ extern CL_API_ENTRY cl_int CL_API_CALL
+ clCreateSubDevicesEXT( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ typedef CL_API_ENTRY cl_int
+ ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
+ const cl_device_partition_property_ext * /* properties */,
+ cl_uint /*num_entries*/,
+ cl_device_id * /*out_devices*/,
+ cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+ /* cl_device_partition_property_ext */
+ #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
+ #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
+ #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
+ #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
+
+ /* clDeviceGetInfo selectors */
+ #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
+ #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
+ #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
+ #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
+ #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
+
+ /* error codes */
+ #define CL_DEVICE_PARTITION_FAILED_EXT -1057
+ #define CL_INVALID_PARTITION_COUNT_EXT -1058
+ #define CL_INVALID_PARTITION_NAME_EXT -1059
+
+ /* CL_AFFINITY_DOMAINs */
+ #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
+ #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
+ #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
+ #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
+ #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
+ #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
+
+ /* cl_device_partition_property_ext list terminators */
+ #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
+ #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
+
+typedef cl_uint cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id device,
+ size_t image_width,
+ size_t image_height,
+ const cl_image_format *image_format,
+ cl_image_pitch_info_qcom param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+ /* Type of external memory allocation. */
+ /* Legal values will be defined in layered extensions. */
+ cl_uint allocation_type;
+
+ /* Host cache policy for this external memory allocation. */
+ cl_uint host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+ /* Type of external memory allocation. */
+ /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+ cl_mem_ext_host_ptr ext_host_ptr;
+
+ /* ION file descriptor */
+ int ion_filedesc;
+
+ /* Host pointer to the ION allocated memory */
+ void* ion_hostptr;
+
+} cl_mem_ion_host_ptr;
+
+#endif /* CL_VERSION_1_1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
new file mode 100644
index 0000000000..af2036cc99
--- /dev/null
+++ b/include/CL/cl_gl.h
@@ -0,0 +1,162 @@
+/**********************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint cl_gl_object_type;
+typedef cl_uint cl_gl_texture_info;
+typedef cl_uint cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
+#define CL_GL_OBJECT_BUFFER 0x2000
+#define CL_GL_OBJECT_TEXTURE2D 0x2001
+#define CL_GL_OBJECT_TEXTURE3D 0x2002
+#define CL_GL_OBJECT_RENDERBUFFER 0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
+#define CL_GL_OBJECT_TEXTURE1D 0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
+
+/* cl_gl_texture_info */
+#define CL_GL_TEXTURE_TARGET 0x2004
+#define CL_GL_MIPMAP_LEVEL 0x2005
+#define CL_GL_NUM_SAMPLES 0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* bufobj */,
+                     cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLuint /* renderbuffer */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem /* memobj */,
+ cl_gl_object_type * /* gl_object_type */,
+ cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem /* memobj */,
+ cl_gl_texture_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
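GL objects must be explicitly acquired before CL may touch them and released afterwards; without a GL sync object, glFinish()/clFinish() provide the ordering. A hedged sketch of the round trip (`ctx`, `queue`, and `vbo` are hypothetical; core clFinish and GL's glFinish assumed available):

    cl_int err;
    cl_mem clbuf = clCreateFromGLBuffer(ctx, CL_MEM_READ_WRITE, vbo, &err);
    glFinish();                                           /* GL work on vbo is done */
    clEnqueueAcquireGLObjects(queue, 1, &clbuf, 0, NULL, NULL);
    /* ... enqueue kernels that read/write clbuf ... */
    clEnqueueReleaseGLObjects(queue, 1, &clbuf, 0, NULL, NULL);
    clFinish(queue);                                      /* GL may use vbo again */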
+
+
+// Deprecated OpenCL 1.1 APIs
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context /* context */,
+ cl_mem_flags /* flags */,
+ cl_GLenum /* target */,
+ cl_GLint /* miplevel */,
+ cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+/* cl_khr_gl_sharing extension */
+
+#define cl_khr_gl_sharing 1
+
+typedef cl_uint cl_gl_context_info;
+
+/* Additional Error Codes */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
+
+/* cl_gl_context_info */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
+
+/* Additional cl_context_properties */
+#define CL_GL_CONTEXT_KHR 0x2008
+#define CL_EGL_DISPLAY_KHR 0x2009
+#define CL_GLX_DISPLAY_KHR 0x200A
+#define CL_WGL_HDC_KHR 0x200B
+#define CL_CGL_SHAREGROUP_KHR 0x200C
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+ cl_gl_context_info /* param_name */,
+ size_t /* param_value_size */,
+ void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+ const cl_context_properties * properties,
+ cl_gl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_H */
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
new file mode 100644
index 0000000000..77d53536f6
--- /dev/null
+++ b/include/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
+/* OpenGL dependencies. */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+ #include <OpenCL/cl_gl.h>
+#else
+ #include <CL/cl_gl.h>
+#endif
+
+/*
+ * For each extension, follow this template
+ * cl_VEN_extname extension */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ * This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/*
+ * cl_khr_gl_event extension
+ * See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context /* context */,
+ cl_GLsync /* cl_GLsync */,
+ cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_GL_EXT_H */
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
new file mode 100644
index 0000000000..cf2b7210ac
--- /dev/null
+++ b/include/CL/cl_platform.h
@@ -0,0 +1,1254 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+ #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+ #define CL_API_ENTRY
+ #define CL_API_CALL __stdcall
+ #define CL_CALLBACK __stdcall
+#else
+ #define CL_API_ENTRY
+ #define CL_API_CALL
+ #define CL_CALLBACK
+#endif
+
+#ifdef __APPLE__
+ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import))
+ #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+
+ #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+ #else
+ #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+ #endif
+#else
+ #define CL_EXTENSION_WEAK_LINK
+ #define CL_API_SUFFIX__VERSION_1_0
+ #define CL_EXT_SUFFIX__VERSION_1_0
+ #define CL_API_SUFFIX__VERSION_1_1
+ #define CL_EXT_SUFFIX__VERSION_1_1
+ #define CL_API_SUFFIX__VERSION_1_2
+ #define CL_EXT_SUFFIX__VERSION_1_2
+
+ #ifdef __GNUC__
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+ #elif _WIN32
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)
+ #endif
+
+ #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)
+ #endif
+ #else
+ #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+
+ #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+ #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+ #endif
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types */
+typedef signed __int8 cl_char;
+typedef unsigned __int8 cl_uchar;
+typedef signed __int16 cl_short;
+typedef unsigned __int16 cl_ushort;
+typedef signed __int32 cl_int;
+typedef unsigned __int32 cl_uint;
+typedef signed __int64 cl_long;
+typedef unsigned __int64 cl_ulong;
+
+typedef unsigned __int16 cl_half;
+typedef float cl_float;
+typedef double cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN 1.175494350822287507969e-38f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN 2.225073858507201383090e-308
+#define CL_DBL_EPSILON 2.220446049250313080847e-16
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#define CL_NAN (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF ((cl_float) 1e50)
+#define CL_HUGE_VAL ((cl_double) 1e500)
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types */
+typedef int8_t cl_char;
+typedef uint8_t cl_uchar;
+typedef int16_t cl_short __attribute__((aligned(2)));
+typedef uint16_t cl_ushort __attribute__((aligned(2)));
+typedef int32_t cl_int __attribute__((aligned(4)));
+typedef uint32_t cl_uint __attribute__((aligned(4)));
+typedef int64_t cl_long __attribute__((aligned(8)));
+typedef uint64_t cl_ulong __attribute__((aligned(8)));
+
+typedef uint16_t cl_half __attribute__((aligned(2)));
+typedef float cl_float __attribute__((aligned(4)));
+typedef double cl_double __attribute__((aligned(8)));
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT 8
+#define CL_SCHAR_MAX 127
+#define CL_SCHAR_MIN (-127-1)
+#define CL_CHAR_MAX CL_SCHAR_MAX
+#define CL_CHAR_MIN CL_SCHAR_MIN
+#define CL_UCHAR_MAX 255
+#define CL_SHRT_MAX 32767
+#define CL_SHRT_MIN (-32767-1)
+#define CL_USHRT_MAX 65535
+#define CL_INT_MAX 2147483647
+#define CL_INT_MIN (-2147483647-1)
+#define CL_UINT_MAX 0xffffffffU
+#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG 6
+#define CL_FLT_MANT_DIG 24
+#define CL_FLT_MAX_10_EXP +38
+#define CL_FLT_MAX_EXP +128
+#define CL_FLT_MIN_10_EXP -37
+#define CL_FLT_MIN_EXP -125
+#define CL_FLT_RADIX 2
+#define CL_FLT_MAX 0x1.fffffep127f
+#define CL_FLT_MIN 0x1.0p-126f
+#define CL_FLT_EPSILON 0x1.0p-23f
+
+#define CL_DBL_DIG 15
+#define CL_DBL_MANT_DIG 53
+#define CL_DBL_MAX_10_EXP +308
+#define CL_DBL_MAX_EXP +1024
+#define CL_DBL_MIN_10_EXP -307
+#define CL_DBL_MIN_EXP -1021
+#define CL_DBL_RADIX 2
+#define CL_DBL_MAX 0x1.fffffffffffffp1023
+#define CL_DBL_MIN 0x1.0p-1022
+#define CL_DBL_EPSILON 0x1.0p-52
+
+#define CL_M_E 2.718281828459045090796
+#define CL_M_LOG2E 1.442695040888963387005
+#define CL_M_LOG10E 0.434294481903251816668
+#define CL_M_LN2 0.693147180559945286227
+#define CL_M_LN10 2.302585092994045901094
+#define CL_M_PI 3.141592653589793115998
+#define CL_M_PI_2 1.570796326794896557999
+#define CL_M_PI_4 0.785398163397448278999
+#define CL_M_1_PI 0.318309886183790691216
+#define CL_M_2_PI 0.636619772367581382433
+#define CL_M_2_SQRTPI 1.128379167095512558561
+#define CL_M_SQRT2 1.414213562373095145475
+#define CL_M_SQRT1_2 0.707106781186547572737
+
+#define CL_M_E_F 2.71828174591064f
+#define CL_M_LOG2E_F 1.44269502162933f
+#define CL_M_LOG10E_F 0.43429449200630f
+#define CL_M_LN2_F 0.69314718246460f
+#define CL_M_LN10_F 2.30258512496948f
+#define CL_M_PI_F 3.14159274101257f
+#define CL_M_PI_2_F 1.57079637050629f
+#define CL_M_PI_4_F 0.78539818525314f
+#define CL_M_1_PI_F 0.31830987334251f
+#define CL_M_2_PI_F 0.63661974668503f
+#define CL_M_2_SQRTPI_F 1.12837922573090f
+#define CL_M_SQRT2_F 1.41421353816986f
+#define CL_M_SQRT1_2_F 0.70710676908493f
+
+#if defined( __GNUC__ )
+ #define CL_HUGE_VALF __builtin_huge_valf()
+ #define CL_HUGE_VAL __builtin_huge_val()
+ #define CL_NAN __builtin_nanf( "" )
+#else
+ #define CL_HUGE_VALF ((cl_float) 1e50)
+ #define CL_HUGE_VAL ((cl_double) 1e500)
+ float nanf( const char * );
+ #define CL_NAN nanf( "" )
+#endif
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
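+
+/* Editorial note: in the non-GNU fallback above, ((cl_float) 1e50) and
+ * ((cl_double) 1e500) deliberately overflow and evaluate to +infinity on
+ * IEEE-754 hosts; CL_NAN is then obtained as infinity minus infinity. */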
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ * Note: OpenCL requires that all types be naturally aligned.
+ * This means that vector types must be naturally aligned.
+ * For example, a vector of four floats must be aligned to
+ * a 16 byte boundary (calculated as 4 * the natural 4-byte
+ * alignment of the float). The alignment qualifiers here
+ * will only function properly if your compiler supports them
+ * and if you don't actively work to defeat them. For example,
+ * in order for a cl_float4 to be 16 byte aligned in a struct,
+ * the start of the struct must itself be 16-byte aligned.
+ *
+ * Maintaining proper alignment is the user's responsibility.
+ */
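+
+/* Editorial sketch, not part of the original header: one way to keep a
+ * cl_float4 correctly aligned inside a host-side struct. The type and field
+ * names are hypothetical.
+ *
+ *   struct payload
+ *   {
+ *       cl_float4 v;      // 16-byte aligned member placed first
+ *       cl_int    count;  // narrower members follow
+ *   };
+ *
+ * A heap-allocated payload must itself start on a 16-byte boundary, e.g. via
+ * C11 aligned_alloc(16, sizeof(struct payload)). */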
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+ #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+ typedef vector unsigned char __cl_uchar16;
+ typedef vector signed char __cl_char16;
+ typedef vector unsigned short __cl_ushort8;
+ typedef vector signed short __cl_short8;
+ typedef vector unsigned int __cl_uint4;
+ typedef vector signed int __cl_int4;
+ typedef vector float __cl_float4;
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_INT4__ 1
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <xmmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef float __cl_float4 __attribute__((vector_size(16)));
+ #else
+ typedef __m128 __cl_float4;
+ #endif
+ #define __CL_FLOAT4__ 1
+#endif
+
+#if defined( __SSE2__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <emmintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
+ typedef cl_char __cl_char16 __attribute__((vector_size(16)));
+ typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
+ typedef cl_short __cl_short8 __attribute__((vector_size(16)));
+ typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
+ typedef cl_int __cl_int4 __attribute__((vector_size(16)));
+ typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
+ typedef cl_long __cl_long2 __attribute__((vector_size(16)));
+ typedef cl_double __cl_double2 __attribute__((vector_size(16)));
+ #else
+ typedef __m128i __cl_uchar16;
+ typedef __m128i __cl_char16;
+ typedef __m128i __cl_ushort8;
+ typedef __m128i __cl_short8;
+ typedef __m128i __cl_uint4;
+ typedef __m128i __cl_int4;
+ typedef __m128i __cl_ulong2;
+ typedef __m128i __cl_long2;
+ typedef __m128d __cl_double2;
+ #endif
+ #define __CL_UCHAR16__ 1
+ #define __CL_CHAR16__ 1
+ #define __CL_USHORT8__ 1
+ #define __CL_SHORT8__ 1
+ #define __CL_INT4__ 1
+ #define __CL_UINT4__ 1
+ #define __CL_ULONG2__ 1
+ #define __CL_LONG2__ 1
+ #define __CL_DOUBLE2__ 1
+#endif
+
+#if defined( __MMX__ )
+ #include <mmintrin.h>
+ #if defined( __GNUC__ )
+ typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
+ typedef cl_char __cl_char8 __attribute__((vector_size(8)));
+ typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
+ typedef cl_short __cl_short4 __attribute__((vector_size(8)));
+ typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
+ typedef cl_int __cl_int2 __attribute__((vector_size(8)));
+ typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
+ typedef cl_long __cl_long1 __attribute__((vector_size(8)));
+ typedef cl_float __cl_float2 __attribute__((vector_size(8)));
+ #else
+ typedef __m64 __cl_uchar8;
+ typedef __m64 __cl_char8;
+ typedef __m64 __cl_ushort4;
+ typedef __m64 __cl_short4;
+ typedef __m64 __cl_uint2;
+ typedef __m64 __cl_int2;
+ typedef __m64 __cl_ulong1;
+ typedef __m64 __cl_long1;
+ typedef __m64 __cl_float2;
+ #endif
+ #define __CL_UCHAR8__ 1
+ #define __CL_CHAR8__ 1
+ #define __CL_USHORT4__ 1
+ #define __CL_SHORT4__ 1
+ #define __CL_INT2__ 1
+ #define __CL_UINT2__ 1
+ #define __CL_ULONG1__ 1
+ #define __CL_LONG1__ 1
+ #define __CL_FLOAT2__ 1
+#endif
+
+#if defined( __AVX__ )
+ #if defined( __MINGW64__ )
+ #include <intrin.h>
+ #else
+ #include <immintrin.h>
+ #endif
+ #if defined( __GNUC__ )
+ typedef cl_float __cl_float8 __attribute__((vector_size(32)));
+ typedef cl_double __cl_double4 __attribute__((vector_size(32)));
+ #else
+ typedef __m256 __cl_float8;
+ typedef __m256d __cl_double4;
+ #endif
+ #define __CL_FLOAT8__ 1
+ #define __CL_DOUBLE4__ 1
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+ #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+ /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
+ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
+ /* #include <crtdefs.h> */
+ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
+ #define CL_ALIGNED(_x)
+#else
+ #warning Need to implement some method to align data here
+ #define CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ /* .xyzw and .s0123...{f|F} are supported */
+ #define CL_HAS_NAMED_VECTOR_FIELDS 1
+ /* .hi and .lo are supported */
+ #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+ cl_char CL_ALIGNED(2) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_char x, y; };
+ __extension__ struct{ cl_char s0, s1; };
+ __extension__ struct{ cl_char lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2;
+#endif
+}cl_char2;
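+
+/* Editorial sketch, not part of the original header: every view in the union
+ * above aliases the same two bytes, so these writes are equivalent where the
+ * named fields are available:
+ *
+ *   cl_char2 c;
+ *   c.s[0] = 1; c.s[1] = 2;   // portable array view
+ *   c.x  = 1;   c.y  = 2;     // with CL_HAS_NAMED_VECTOR_FIELDS
+ *   c.lo = 1;   c.hi = 2;     // with CL_HAS_HI_LO_VECTOR_FIELDS
+ *
+ * The wider cl_*n unions below extend the same pattern, with .lo/.hi becoming
+ * half-width subvectors. */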
+
+typedef union
+{
+ cl_char CL_ALIGNED(4) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_char x, y, z, w; };
+ __extension__ struct{ cl_char s0, s1, s2, s3; };
+ __extension__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[2];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef cl_char4 cl_char3;
+
+typedef union
+{
+ cl_char CL_ALIGNED(8) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_char x, y, z, w; };
+ __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[4];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8;
+#endif
+}cl_char8;
+
+typedef union
+{
+ cl_char CL_ALIGNED(16) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+ __cl_char2 v2[8];
+#endif
+#if defined( __CL_CHAR4__)
+ __cl_char4 v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+ __cl_char8 v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+ __cl_char16 v16;
+#endif
+}cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union
+{
+ cl_uchar CL_ALIGNED(2) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uchar x, y; };
+ __extension__ struct{ cl_uchar s0, s1; };
+ __extension__ struct{ cl_uchar lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2;
+#endif
+}cl_uchar2;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(4) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uchar x, y, z, w; };
+ __extension__ struct{ cl_uchar s0, s1, s2, s3; };
+ __extension__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[2];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4;
+#endif
+}cl_uchar4;
+
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef cl_uchar4 cl_uchar3;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(8) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uchar x, y, z, w; };
+ __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[4];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8;
+#endif
+}cl_uchar8;
+
+typedef union
+{
+ cl_uchar CL_ALIGNED(16) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+ __cl_uchar2 v2[8];
+#endif
+#if defined( __CL_UCHAR4__)
+ __cl_uchar4 v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+ __cl_uchar8 v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+ __cl_uchar16 v16;
+#endif
+}cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union
+{
+ cl_short CL_ALIGNED(4) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_short x, y; };
+ __extension__ struct{ cl_short s0, s1; };
+ __extension__ struct{ cl_short lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2;
+#endif
+}cl_short2;
+
+typedef union
+{
+ cl_short CL_ALIGNED(8) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_short x, y, z, w; };
+ __extension__ struct{ cl_short s0, s1, s2, s3; };
+ __extension__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[2];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4;
+#endif
+}cl_short4;
+
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef cl_short4 cl_short3;
+
+typedef union
+{
+ cl_short CL_ALIGNED(16) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_short x, y, z, w; };
+ __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[4];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8;
+#endif
+}cl_short8;
+
+typedef union
+{
+ cl_short CL_ALIGNED(32) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+ __cl_short2 v2[8];
+#endif
+#if defined( __CL_SHORT4__)
+ __cl_short4 v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+ __cl_short8 v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+ __cl_short16 v16;
+#endif
+}cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union
+{
+ cl_ushort CL_ALIGNED(4) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ushort x, y; };
+ __extension__ struct{ cl_ushort s0, s1; };
+ __extension__ struct{ cl_ushort lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2;
+#endif
+}cl_ushort2;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(8) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ushort x, y, z, w; };
+ __extension__ struct{ cl_ushort s0, s1, s2, s3; };
+ __extension__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[2];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4;
+#endif
+}cl_ushort4;
+
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef cl_ushort4 cl_ushort3;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(16) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ushort x, y, z, w; };
+ __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[4];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8;
+#endif
+}cl_ushort8;
+
+typedef union
+{
+ cl_ushort CL_ALIGNED(32) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+ __cl_ushort2 v2[8];
+#endif
+#if defined( __CL_USHORT4__)
+ __cl_ushort4 v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+ __cl_ushort8 v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+ __cl_ushort16 v16;
+#endif
+}cl_ushort16;
+
+/* ---- cl_intn ---- */
+typedef union
+{
+ cl_int CL_ALIGNED(8) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_int x, y; };
+ __extension__ struct{ cl_int s0, s1; };
+ __extension__ struct{ cl_int lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2;
+#endif
+}cl_int2;
+
+typedef union
+{
+ cl_int CL_ALIGNED(16) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_int x, y, z, w; };
+ __extension__ struct{ cl_int s0, s1, s2, s3; };
+ __extension__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[2];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4;
+#endif
+}cl_int4;
+
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef cl_int4 cl_int3;
+
+typedef union
+{
+ cl_int CL_ALIGNED(32) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_int x, y, z, w; };
+ __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[4];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[2];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8;
+#endif
+}cl_int8;
+
+typedef union
+{
+ cl_int CL_ALIGNED(64) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+ __cl_int2 v2[8];
+#endif
+#if defined( __CL_INT4__)
+ __cl_int4 v4[4];
+#endif
+#if defined( __CL_INT8__ )
+ __cl_int8 v8[2];
+#endif
+#if defined( __CL_INT16__ )
+ __cl_int16 v16;
+#endif
+}cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union
+{
+ cl_uint CL_ALIGNED(8) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uint x, y; };
+ __extension__ struct{ cl_uint s0, s1; };
+ __extension__ struct{ cl_uint lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2;
+#endif
+}cl_uint2;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(16) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uint x, y, z, w; };
+ __extension__ struct{ cl_uint s0, s1, s2, s3; };
+ __extension__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[2];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4;
+#endif
+}cl_uint4;
+
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef cl_uint4 cl_uint3;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(32) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uint x, y, z, w; };
+ __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[4];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8;
+#endif
+}cl_uint8;
+
+typedef union
+{
+ cl_uint CL_ALIGNED(64) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+ __cl_uint2 v2[8];
+#endif
+#if defined( __CL_UINT4__)
+ __cl_uint4 v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+ __cl_uint8 v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+ __cl_uint16 v16;
+#endif
+}cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union
+{
+ cl_long CL_ALIGNED(16) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_long x, y; };
+ __extension__ struct{ cl_long s0, s1; };
+ __extension__ struct{ cl_long lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2;
+#endif
+}cl_long2;
+
+typedef union
+{
+ cl_long CL_ALIGNED(32) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_long x, y, z, w; };
+ __extension__ struct{ cl_long s0, s1, s2, s3; };
+ __extension__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[2];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4;
+#endif
+}cl_long4;
+
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef cl_long4 cl_long3;
+
+typedef union
+{
+ cl_long CL_ALIGNED(64) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_long x, y, z, w; };
+ __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[4];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8;
+#endif
+}cl_long8;
+
+typedef union
+{
+ cl_long CL_ALIGNED(128) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+ __cl_long2 v2[8];
+#endif
+#if defined( __CL_LONG4__)
+ __cl_long4 v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+ __cl_long8 v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+ __cl_long16 v16;
+#endif
+}cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union
+{
+ cl_ulong CL_ALIGNED(16) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ulong x, y; };
+ __extension__ struct{ cl_ulong s0, s1; };
+ __extension__ struct{ cl_ulong lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2;
+#endif
+}cl_ulong2;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(32) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ulong x, y, z, w; };
+ __extension__ struct{ cl_ulong s0, s1, s2, s3; };
+ __extension__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[2];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4;
+#endif
+}cl_ulong4;
+
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef cl_ulong4 cl_ulong3;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(64) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ulong x, y, z, w; };
+ __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[4];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8;
+#endif
+}cl_ulong8;
+
+typedef union
+{
+ cl_ulong CL_ALIGNED(128) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+ __cl_ulong2 v2[8];
+#endif
+#if defined( __CL_ULONG4__)
+ __cl_ulong4 v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+ __cl_ulong8 v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+ __cl_ulong16 v16;
+#endif
+}cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union
+{
+ cl_float CL_ALIGNED(8) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_float x, y; };
+ __extension__ struct{ cl_float s0, s1; };
+ __extension__ struct{ cl_float lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2;
+#endif
+}cl_float2;
+
+typedef union
+{
+ cl_float CL_ALIGNED(16) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_float x, y, z, w; };
+ __extension__ struct{ cl_float s0, s1, s2, s3; };
+ __extension__ struct{ cl_float2 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[2];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4;
+#endif
+}cl_float4;
+
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef cl_float4 cl_float3;
+
+typedef union
+{
+ cl_float CL_ALIGNED(32) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_float x, y, z, w; };
+ __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_float4 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[4];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8;
+#endif
+}cl_float8;
+
+typedef union
+{
+ cl_float CL_ALIGNED(64) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+ __cl_float2 v2[8];
+#endif
+#if defined( __CL_FLOAT4__)
+ __cl_float4 v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+ __cl_float8 v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+ __cl_float16 v16;
+#endif
+}cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union
+{
+ cl_double CL_ALIGNED(16) s[2];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_double x, y; };
+ __extension__ struct{ cl_double s0, s1; };
+ __extension__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2;
+#endif
+}cl_double2;
+
+typedef union
+{
+ cl_double CL_ALIGNED(32) s[4];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_double x, y, z, w; };
+ __extension__ struct{ cl_double s0, s1, s2, s3; };
+ __extension__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[2];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4;
+#endif
+}cl_double4;
+
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef cl_double4 cl_double3;
+
+typedef union
+{
+ cl_double CL_ALIGNED(64) s[8];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_double x, y, z, w; };
+ __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
+ __extension__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[4];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8;
+#endif
+}cl_double8;
+
+typedef union
+{
+ cl_double CL_ALIGNED(128) s[16];
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+ __extension__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+ __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+ __extension__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+ __cl_double2 v2[8];
+#endif
+#if defined( __CL_DOUBLE4__)
+ __cl_double4 v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+ __cl_double8 v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+ __cl_double16 v16;
+#endif
+}cl_double16;
+
+/* Macro to facilitate debugging
+ * Usage:
+ * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
+ * Each line thereafter of OpenCL C source must end with: \n\
+ * The last line ends in ";
+ *
+ * Example:
+ *
+ * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ * kernel void foo( int a, float * b ) \n\
+ * { \n\
+ * // my comment \n\
+ * *b[ get_global_id(0)] = a; \n\
+ * } \n\
+ * ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+#define __CL_STRINGIFY( _x ) # _x
+#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
+#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
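+
+/* Editorial note: the two-level stringification forces __LINE__ to be
+ * expanded before it is quoted. At, say, line 42 of foo.c the macro therefore
+ * yields the literal "#line 42 \"foo.c\" \n\n"; a single level of # would
+ * instead produce "#line __LINE__ ...". */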
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_PLATFORM_H */
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
new file mode 100644
index 0000000000..3f00524719
--- /dev/null
+++ b/include/CL/opencl.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+
+#else
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+
+#endif
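+
+/* Editorial note: this umbrella header pulls in the core API (cl.h), the
+ * OpenGL sharing APIs (cl_gl.h, cl_gl_ext.h) and the vendor extensions
+ * (cl_ext.h), with Apple's framework-style include paths handled above. */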
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_H */
+
diff --git a/opencl-1.2-stubs/SConscript b/opencl-1.2-stubs/SConscript
new file mode 100644
index 0000000000..dab2e2e7f7
--- /dev/null
+++ b/opencl-1.2-stubs/SConscript
@@ -0,0 +1,7 @@
+Import("env")
+
+opencl = env.SharedLibrary("OpenCL", "opencl_stubs.c")
+alias = Alias("opencl", opencl)
+Default(alias)
+
+Export("opencl")
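+
+# Editorial usage sketch (an assumption, not stated in this file): with the
+# alias above, running `scons opencl` from the project root builds
+# libOpenCL.so from opencl_stubs.c; Default(alias) also makes it a default
+# target of a plain `scons` invocation.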
diff --git a/opencl-1.2-stubs/opencl_stubs.c b/opencl-1.2-stubs/opencl_stubs.c
new file mode 100755
index 0000000000..a76eaa0bf9
--- /dev/null
+++ b/opencl-1.2-stubs/opencl_stubs.c
@@ -0,0 +1,1002 @@
+#include <CL/cl.h>
+#include <stdio.h>
+
+#define PRINT_STUB_ERROR printf("\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\nERROR: %s from stub libOpenCL.so library called! This library can be used to resolve OpenCL symbols at compile time but must *not* be in your runtime path (You need to use a real OpenCL implementation, this one is empty)\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", __func__)
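+
+/* Editorial usage sketch (an assumption, not part of this file): the stub
+ * library exists only to satisfy the linker at build time, e.g.
+ *
+ *   gcc example.c -L<path-to-stub> -lOpenCL
+ *
+ * At run time the dynamic loader must resolve libOpenCL.so to a real OpenCL
+ * implementation; if this stub is picked up instead, every entry point below
+ * prints the banner and fails with CL_OUT_OF_RESOURCES (or NULL). */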
+
+cl_int
+clGetPlatformIDs(cl_uint num_entries,
+ cl_platform_id * platforms,
+ cl_uint * num_platforms)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetPlatformInfo(cl_platform_id platform,
+ cl_platform_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetDeviceIDs(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id * devices,
+ cl_uint * num_devices)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetDeviceInfo(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clCreateSubDevices(cl_device_id in_device,
+ const cl_device_partition_property * properties,
+ cl_uint num_devices,
+ cl_device_id * out_devices,
+ cl_uint * num_devices_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clRetainDevice(cl_device_id device)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseDevice(cl_device_id device)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_context
+clCreateContext(const cl_context_properties * properties,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
+ void * user_data,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_context
+clCreateContextFromType(const cl_context_properties * properties,
+ cl_device_type device_type,
+ void (CL_CALLBACK * pfn_notify )(const char *, const void *, size_t, void *),
+ void * user_data,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clRetainContext(cl_context context)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseContext(cl_context context)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetContextInfo(cl_context context,
+ cl_context_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_command_queue
+clCreateCommandQueue(cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clRetainCommandQueue(cl_command_queue command_queue)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseCommandQueue(cl_command_queue command_queue)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetCommandQueueInfo(cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_mem
+clCreateBuffer(cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void * host_ptr,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_mem
+clCreateSubBuffer(cl_mem buffer,
+ cl_mem_flags flags,
+ cl_buffer_create_type buffer_create_type,
+ const void * buffer_create_info,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_mem
+clCreateImage(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ const cl_image_desc * image_desc,
+ void * host_ptr,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetSupportedImageFormats(cl_context context,
+ cl_mem_flags flags,
+ cl_mem_object_type image_type,
+ cl_uint num_entries,
+ cl_image_format * image_formats,
+ cl_uint * num_image_formats)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetMemObjectInfo(cl_mem memobj,
+ cl_mem_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetImageInfo(cl_mem image,
+ cl_image_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clSetMemObjectDestructorCallback( cl_mem memobj,
+ void (CL_CALLBACK * pfn_notify)( cl_mem memobj, void* user_data),
+ void * user_data )
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_sampler
+clCreateSampler(cl_context context,
+ cl_bool normalized_coords,
+ cl_addressing_mode addressing_mode,
+ cl_filter_mode filter_mode,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clRetainSampler(cl_sampler sampler)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseSampler(cl_sampler sampler)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetSamplerInfo(cl_sampler sampler,
+ cl_sampler_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_program
+clCreateProgramWithSource(cl_context context,
+ cl_uint count,
+ const char ** strings,
+ const size_t * lengths,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_program
+clCreateProgramWithBinary(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const size_t * lengths,
+ const unsigned char ** binaries,
+ cl_int * binary_status,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_program
+clCreateProgramWithBuiltInKernels(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * kernel_names,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clRetainProgram(cl_program program)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseProgram(cl_program program)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clBuildProgram(cl_program program,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data),
+ void * user_data)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clCompileProgram(cl_program program,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ cl_uint num_input_headers,
+ const cl_program * input_headers,
+ const char ** header_include_names,
+ void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data),
+ void * user_data)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_program
+clLinkProgram(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * device_list,
+ const char * options,
+ cl_uint num_input_programs,
+ const cl_program * input_programs,
+ void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data),
+ void * user_data,
+ cl_int * errcode_ret )
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+
+cl_int
+clUnloadPlatformCompiler(cl_platform_id platform)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetProgramInfo(cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetProgramBuildInfo(cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_kernel
+clCreateKernel(cl_program program,
+ const char * kernel_name,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clCreateKernelsInProgram(cl_program program,
+ cl_uint num_kernels,
+ cl_kernel * kernels,
+ cl_uint * num_kernels_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clRetainKernel(cl_kernel kernel)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clSetKernelArg(cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void * arg_value)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetKernelInfo(cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetKernelArgInfo(cl_kernel kernel,
+ cl_uint arg_indx,
+ cl_kernel_arg_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clWaitForEvents(cl_uint num_events,
+ const cl_event * event_list)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetEventInfo(cl_event event,
+ cl_event_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_event
+clCreateUserEvent(cl_context context,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clRetainEvent(cl_event event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clReleaseEvent(cl_event event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clSetUserEventStatus(cl_event event,
+ cl_int execution_status)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clSetEventCallback( cl_event event,
+ cl_int command_exec_callback_type,
+ void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),
+ void * user_data)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clGetEventProfilingInfo(cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void * param_value,
+ size_t * param_value_size_ret)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clFlush(cl_command_queue command_queue)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clFinish(cl_command_queue command_queue)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueReadBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueReadBufferRect(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ const size_t * buffer_offset,
+ const size_t * host_offset,
+ const size_t * region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueWriteBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueWriteBufferRect(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ const size_t * buffer_offset,
+ const size_t * host_offset,
+ const size_t * region,
+ size_t buffer_row_pitch,
+ size_t buffer_slice_pitch,
+ size_t host_row_pitch,
+ size_t host_slice_pitch,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueFillBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ const void * pattern,
+ size_t pattern_size,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueCopyBuffer(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ size_t src_offset,
+ size_t dst_offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueCopyBufferRect(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_buffer,
+ const size_t * src_origin,
+ const size_t * dst_origin,
+ const size_t * region,
+ size_t src_row_pitch,
+ size_t src_slice_pitch,
+ size_t dst_row_pitch,
+ size_t dst_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueReadImage(cl_command_queue command_queue,
+ cl_mem image,
+ cl_bool blocking_read,
+ const size_t * origin,
+ const size_t * region,
+ size_t row_pitch,
+ size_t slice_pitch,
+ void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueWriteImage(cl_command_queue command_queue,
+ cl_mem image,
+ cl_bool blocking_write,
+ const size_t * origin,
+ const size_t * region,
+ size_t input_row_pitch,
+ size_t input_slice_pitch,
+ const void * ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueFillImage(cl_command_queue command_queue,
+ cl_mem image,
+ const void * fill_color,
+ const size_t * origin,
+ const size_t * region,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueCopyImage(cl_command_queue command_queue,
+ cl_mem src_image,
+ cl_mem dst_image,
+ const size_t * src_origin,
+ const size_t * dst_origin,
+ const size_t * region,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+ cl_mem src_image,
+ cl_mem dst_buffer,
+ const size_t * src_origin,
+ const size_t * region,
+ size_t dst_offset,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+ cl_mem src_buffer,
+ cl_mem dst_image,
+ size_t src_offset,
+ const size_t * dst_origin,
+ const size_t * region,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+void *
+clEnqueueMapBuffer(cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+void *
+clEnqueueMapImage(cl_command_queue command_queue,
+ cl_mem image,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ const size_t * origin,
+ const size_t * region,
+ size_t * image_row_pitch,
+ size_t * image_slice_pitch,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clEnqueueUnmapMemObject(cl_command_queue command_queue,
+ cl_mem memobj,
+ void * mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueMigrateMemObjects(cl_command_queue command_queue,
+ cl_uint num_mem_objects,
+ const cl_mem * mem_objects,
+ cl_mem_migration_flags flags,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t * global_work_offset,
+ const size_t * global_work_size,
+ const size_t * local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueTask(cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueNativeKernel(cl_command_queue command_queue,
+ void (CL_CALLBACK * user_func)(void *),
+ void * args,
+ size_t cb_args,
+ cl_uint num_mem_objects,
+ const cl_mem * mem_list,
+ const void ** args_mem_loc,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
+ cl_uint num_events_in_wait_list,
+ const cl_event * event_wait_list,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+void *
+clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
+ const char * func_name)
+{
+ PRINT_STUB_ERROR;
+ return NULL;
+}
+
+
+cl_mem
+clCreateImage2D(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_row_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_mem
+clCreateImage3D(cl_context context,
+ cl_mem_flags flags,
+ const cl_image_format * image_format,
+ size_t image_width,
+ size_t image_height,
+ size_t image_depth,
+ size_t image_row_pitch,
+ size_t image_slice_pitch,
+ void * host_ptr,
+ cl_int * errcode_ret)
+{
+ PRINT_STUB_ERROR;
+ if( errcode_ret ) *errcode_ret = CL_OUT_OF_RESOURCES;
+ return NULL;
+}
+
+cl_int
+clEnqueueMarker(cl_command_queue command_queue,
+ cl_event * event)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueWaitForEvents(cl_command_queue command_queue,
+ cl_uint num_events,
+ const cl_event * event_list)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clEnqueueBarrier(cl_command_queue command_queue)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+cl_int
+clUnloadCompiler(void)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
+
+void *
+clGetExtensionFunctionAddress(const char * func_name)
+{
+ PRINT_STUB_ERROR;
+ return NULL;
+}
+
+cl_int
+clSetCommandQueueProperty(cl_command_queue command_queue,
+ cl_command_queue_properties properties,
+ cl_bool enable,
+ cl_command_queue_properties * old_properties)
+{
+ PRINT_STUB_ERROR;
+ return CL_OUT_OF_RESOURCES;
+}
diff --git a/scripts/add_copyright.py b/scripts/add_copyright.py
new file mode 100755
index 0000000000..6142298828
--- /dev/null
+++ b/scripts/add_copyright.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+import glob
+import os.path
+
+eula_copyright = open("scripts/copyright_eula.txt",'r').read()
+
+def add_cpp_copyright( f, content):
+ global eula_copyright
+ out = open(f,'w')
+ out.write("/*\n")
+ for line in eula_copyright.split('\n')[:-1]:
+ out.write(" *");
+ if line.strip() != "":
+ out.write(" %s" %line)
+ out.write("\n")
+ out.write(" */\n")
+ out.write(content.strip())
+ out.write("\n")
+ out.close()
+
+def add_python_copyright( f, content):
+ global eula_copyright
+ out = open(f,'w')
+ for line in eula_copyright.split('\n')[:-1]:
+ out.write("#");
+ if line.strip() != "":
+ out.write(" %s" %line)
+ out.write("\n")
+ out.write(content.strip())
+ out.write("\n")
+ out.close()
+
+def remove_comment( content ):
+ comment=True
+ out=""
+ for line in content.split('\n'):
+ if comment:
+ if line.startswith(' */'):
+ comment=False
+ elif line.startswith('/*') or line.startswith(' *'):
+ #print(line)
+ continue
+ else:
+ raise Exception("ERROR: not a comment ? '%s'"% line)
+ else:
+ out += line + "\n"
+ return out
+def remove_comment_python( content ):
+ comment=True
+ out=""
+ for line in content.split('\n'):
+ if comment and line.startswith('#'):
+ continue
+ else:
+ comment = False
+ out += line + "\n"
+ return out
+
+for top in ['./arm_compute', './tests','./src','./examples','./utils/']:
+ for root, _, files in os.walk(top):
+ for f in files:
+ path = os.path.join(root, f)
+
+ if f in ['.clang-tidy', '.clang-format']:
+ print("Skipping file: {}".format(path))
+ continue
+
+ with open(path, 'r', encoding='utf-8') as fd:
+ content = fd.read()
+ _, extension = os.path.splitext(f)
+
+ if extension in ['.cpp', '.h', '.inl', '.cl']:
+ if not content.startswith('/*'):
+ add_cpp_copyright(path, content)
+ elif extension == '.py' or f in ['SConstruct', 'SConscript']:
+ if not content.startswith('# Copyright'):
+ add_python_copyright(path, content)
+ elif f == 'CMakeLists.txt':
+ if not content.startswith('# Copyright'):
+ add_python_copyright(path, content)
+ else:
+ raise Exception("Unhandled file: {}".format(path))
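+
+# Editorial usage sketch (an assumption): run from the repository root, e.g.
+#   python3 scripts/add_copyright.py
+# since both the EULA text and the directories walked above are addressed
+# with root-relative paths.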
diff --git a/scripts/check_bad_style.sh b/scripts/check_bad_style.sh
new file mode 100755
index 0000000000..1cc514cdc3
--- /dev/null
+++ b/scripts/check_bad_style.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+set -e
+
+DIRECTORIES="./arm_compute ./src ./examples ./tests ./utils"
+
+grep -HrnP "/\*\*$" $DIRECTORIES | tee bad_style.log
+if (( `cat bad_style.log | wc -l` > 0 ))
+then
+ echo ""
+ echo "ERROR: Doxygen comments should start on the first line: \"/** My comment\""
+ exit -1
+fi
+
+grep -Hnr --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log
+if (( `cat bad_style.log | wc -l` > 0 ))
+then
+ echo ""
+ echo "ERROR: Doxygen comments shouldn't use '@brief'"
+ exit -1
+fi
+
+grep -HnRE "\buint " --exclude-dir=cl_kernels $DIRECTORIES | tee bad_style.log
+if [[ $(cat bad_style.log | wc -l) -gt 0 ]]
+then
+ echo ""
+ echo "ERROR: C/C++ don't define 'uint'. Use 'unsigned int' instead."
+ exit -1
+fi
+
+grep -HnR "float32_t" $DIRECTORIES | tee bad_style.log
+if [[ $(cat bad_style.log | wc -l) -gt 0 ]]
+then
+ echo ""
+ echo "ERROR: C/C++ don't define 'float32_t'. Use 'float' instead."
+ exit -1
+fi
+
+grep -Hnir "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log
+if [[ $(cat bad_style.log | wc -l) -gt 0 ]]
+then
+ echo ""
+ echo "ERROR: Reference to arm_cv detected in the files above (Replace with arm_compute)"
+ exit -1
+fi
+
+spdx_missing=0
+for f in $(find $DIRECTORIES -type f)
+do
+ if [[ $(grep SPDX $f | wc -l) == 0 ]]
+ then
+ # List of exceptions:
+ case `basename $f` in
+ "arm_compute_version.embed");;
+ ".clang-format");;
+ ".clang-tidy");;
+ #It's an error for other files to not contain the MIT header:
+ *)
+ spdx_missing=1
+ echo $f;
+ ;;
+ esac
+ fi;
+done
+
+if [[ $spdx_missing -gt 0 ]]
+then
+ echo ""
+ echo "ERROR: MIT Copyright header missing from the file(s) above."
+ exit -1
+fi
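+
+# Editorial usage sketch (an assumption): run from the repository root, e.g.
+#   ./scripts/check_bad_style.sh
+# The script exits non-zero at the first failing check and leaves the
+# offending lines in bad_style.log.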
diff --git a/scripts/check_clang-tidy.py b/scripts/check_clang-tidy.py
new file mode 100755
index 0000000000..ead2513a93
--- /dev/null
+++ b/scripts/check_clang-tidy.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+import sys
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print("usage: {} CLANG-TIDY_OUTPUT_FILE".format(sys.argv[0]))
+ sys.exit(1)
+
+ failed = False
+
+ with open(sys.argv[1], mode="r") as clang_tidy_file:
+ lines = clang_tidy_file.readlines()
+
+ for i in range(0, len(lines)):
+ line = lines[i]
+
+ if "error:" in line:
+ if (("Utils.cpp" in line and "'arm_compute_version.embed' file not found" in line) or
+ ("cl2.hpp" in line and "cast from pointer to smaller type 'cl_context_properties' (aka 'int') loses information" in line) or
+ ("memory" in line and "cast from pointer to smaller type 'uintptr_t' (aka 'unsigned int') loses information" in line) or
+ "3rdparty" in line):
+ continue
+
+ failed = True
+ print(line)
+ elif "warning:" in line:
+ if ("uninitialized record type: '__ret'" in line or
+ "local variable '__bound_functor' is still referred to by the global variable '__once_callable'" in line or
+ ("Error.cpp" in line and "thrown exception type is not nothrow copy constructible" in line) or
+ ("Error.cpp" in line and "uninitialized record type: 'args'" in line) or
+ ("Error.cpp" in line and "do not call c-style vararg functions" in line) or
+ ("Error.cpp" in line and "do not define a C-style variadic function" in line) or
+ ("NEMinMaxLocationKernel.cpp" in line and "move constructors should be marked noexcept" in line) or
+ ("NEMinMaxLocationKernel.cpp" in line and "move assignment operators should be marked noexcept" in line) or
+ ("PMUCounter.cpp" in line and "consider replacing 'long long' with 'int64'" in line) or
+ "3rdparty" in line):
+ continue
+
+ if "do not use C-style cast to convert between unrelated types" in line:
+ if i + 1 < len(lines) and "vgetq_lane_f16" in lines[i + 1]:
+ continue
+
+ if "use 'using' instead of 'typedef'" in line:
+ if i + 1 < len(lines) and "BOOST_FIXTURE_TEST_SUITE" in lines[i + 1]:
+ continue
+
+ if "do not call c-style vararg functions" in line:
+ if (i + 1 < len(lines) and
+ ("BOOST_TEST" in lines[i + 1] or
+ "BOOST_FAIL" in lines[i + 1] or
+ "BOOST_CHECK_THROW" in lines[i + 1] or
+ "syscall" in lines[i + 1])):
+ continue
+
+ failed = True
+ print(line)
+
+ sys.exit(0 if not failed else 1)
diff --git a/scripts/clang-tidy.h b/scripts/clang-tidy.h
new file mode 100644
index 0000000000..32b0f6955e
--- /dev/null
+++ b/scripts/clang-tidy.h
@@ -0,0 +1,112 @@
+#include <arm_neon.h>
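+
+// Stub definitions of the ARMv8.2 half-precision NEON intrinsics. This header
+// is force-included (via -include) by scripts/clang-tidy.sh for the aarch64
+// run so that clang-tidy can parse the FP16 code paths even when its bundled
+// headers lack these intrinsics; the bodies only need to type-check, they are
+// never executed.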
+
+inline float16x8_t vcvtq_f16_u16(uint16x8_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline uint16x8_t vcvtq_u16_f16(float16x8_t)
+{
+ return vdupq_n_u16(0);
+}
+
+inline int16x8_t vcvtq_s16_f16(float16x8_t)
+{
+ return vdupq_n_s16(0);
+}
+
+inline float16x8_t vaddq_f16(float16x8_t, float16x8_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline float16x8_t vsubq_f16(float16x8_t, float16x8_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline float16x8_t vmulq_f16(float16x8_t, float16x8_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline float16x8_t vmulq_n_f16(float16x8_t, float16_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline float16x8_t vfmaq_f16(float16x8_t, float16x8_t, float16x8_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline uint16x8_t vcgeq_f16(float16x8_t, float16x8_t)
+{
+ return vdupq_n_u16(0);
+}
+
+inline uint16x8_t vcgtq_f16(float16x8_t, float16x8_t)
+{
+ return vdupq_n_u16(0);
+}
+
+inline float16x8_t vbslq_f16(uint16x8_t, float16x8_t, float16x8_t)
+{
+    return vdupq_n_f16(0);
+}
+
+inline float16x8_t vextq_f16(float16x8_t, float16x8_t, int)
+{
+ return vdupq_n_f16(0);
+}
+
+inline float16x8_t vabsq_f16(float16x8_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline float16x8_t vcvtq_f16_s16(int16x8_t)
+{
+    return vdupq_n_f16(0);
+}
+
+inline float16x4_t vbsl_f16(uint16x4_t, float16x4_t, float16x4_t)
+{
+ return vdup_n_f16(0);
+}
+
+inline float16x8_t vrsqrteq_f16(float16x8_t)
+{
+ return vdupq_n_f16(0);
+}
+
+inline float16x8_t vfmsq_f16(float16x8_t, float16x8_t, float16x8_t)
+{
+    return vdupq_n_f16(0);
+}
+
+inline float16x8_t vrecpeq_f16(float16x8_t)
+{
+    return vdupq_n_f16(0);
+}
+
+inline float16x8_t vrecpsq_f16(float16x8_t, float16x8_t)
+{
+    return vdupq_n_f16(0);
+}
+
+inline float16x8_t vmaxq_f16(float16x8_t, float16x8_t)
+{
+    return vdupq_n_f16(0);
+}
+
+inline float16x8_t vminq_f16(float16x8_t, float16x8_t)
+{
+    return vdupq_n_f16(0);
+}
+
+inline uint16x8_t vcltq_f16(float16x8_t, float16x8_t)
+{
+ return vdupq_n_u16(0);
+}
+
diff --git a/scripts/clang-tidy.sh b/scripts/clang-tidy.sh
new file mode 100755
index 0000000000..053e5783c2
--- /dev/null
+++ b/scripts/clang-tidy.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+
+DIRECTORIES="./arm_compute ./src ./examples ./tests ./utils"
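+
+# Usage: clang-tidy.sh [file.cpp ...]
+# With no arguments every .cpp file under the directories above is analysed.
+# Note that only the aarch64 configuration is run; the armv7 invocation at
+# the bottom of the script is currently commented out.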
+
+if [ $# -eq 0 ]
+then
+ files=$(find $DIRECTORIES -type f -name \*.cpp | sort)
+else
+ files=$@
+fi
+
+SCRIPT_PATH=$(dirname $0)
+
+CLANG_TIDY=$(which clang-tidy)
+
+if [[ -z $CLANG_TIDY ]]; then
+ echo "clang-tidy not found!"
+ exit 1
+else
+ echo "Found clang-tidy:" $CLANG_TIDY
+fi
+
+CLANG_TIDY_PATH=$(dirname $CLANG_TIDY)/..
+
+ARMV7_GCC=$(which arm-linux-gnueabihf-g++)
+
+if [[ -z $ARMV7_GCC ]]; then
+ echo "arm-linux-gnueabihf-g++ not found!"
+ exit 1
+else
+ echo "Found arm-linux-gnueabihf-g++:" $ARMV7_GCC
+fi
+
+ARMV7_GCC_PATH=$(dirname $ARMV7_GCC)/..
+
+AARCH64_GCC=$(which aarch64-linux-gnu-g++)
+
+if [[ -z $AARCH64_GCC ]]; then
+ echo "aarch64-linux-gnu-g++ not found!"
+ exit 1
+else
+ echo "Found aarch64-linux-gnu-g++:" $AARCH64_GCC
+fi
+
+AARCH64_GCC_PATH=$(dirname $AARCH64_GCC)/..
+
+function armv7
+{
+ USE_BOOST=""
+
+ if [[ "$1" == *tests/validation* ]]
+ then
+ USE_BOOST="-DBOOST"
+ fi
+
+ $CLANG_TIDY \
+ "$1" \
+ -- \
+ -target armv7a-none-linux-gnueabihf \
+ --gcc-toolchain=$ARMV7_GCC_PATH \
+ -std=c++11 \
+ -Iinclude -I. -I3rdparty/include -Ikernels -Itests -Icomputer_vision \
+ -DARM_COMPUTE_CPP_SCHEDULER=1 $USE_BOOST
+ #read -rsp $'Press enter to continue...\n'
+}
+
+function aarch64
+{
+ USE_BOOST=""
+
+ if [[ "$1" == *tests/validation* ]]
+ then
+ USE_BOOST="-DBOOST"
+ fi
+
+ $CLANG_TIDY \
+ "$1" \
+ -- \
+    -target aarch64-none-linux-gnu \
+ --gcc-toolchain=$AARCH64_GCC_PATH \
+ -std=c++11 \
+ -include $SCRIPT_PATH/clang-tidy.h \
+ -Iinclude -I. -I3rdparty/include -Ikernels -Itests -Icomputer_vision \
+ -DARM_COMPUTE_ENABLE_FP16 -DARM_COMPUTE_CPP_SCHEDULER=1 $USE_BOOST
+}
+
+for f in $files; do
+ #armv7 "$f"
+ aarch64 "$f"
+done
diff --git a/scripts/copyright_eula.txt b/scripts/copyright_eula.txt
new file mode 100644
index 0000000000..50e8090e4f
--- /dev/null
+++ b/scripts/copyright_eula.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2016, 2017 ARM Limited.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/scripts/copyright_mit.txt b/scripts/copyright_mit.txt
new file mode 100644
index 0000000000..0beacf8d93
--- /dev/null
+++ b/scripts/copyright_mit.txt
@@ -0,0 +1,21 @@
+Copyright (c) 2016, 2017 ARM Limited.
+
+SPDX-License-Identifier: MIT
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/scripts/fix_code_formatting.sh b/scripts/fix_code_formatting.sh
new file mode 100755
index 0000000000..2ab3c1d532
--- /dev/null
+++ b/scripts/fix_code_formatting.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
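+
+# Reformat the given files (or every source file under DIRECTORIES) in place:
+# tabs are replaced by spaces, then clang-format (driven by the repository
+# .clang-format file) and astyle are applied.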
+
+ASTYLE_PARAMETERS=" --style=ansi \
+ --indent=spaces \
+ --indent-switches \
+ --indent-col1-comments \
+ --min-conditional-indent=0 \
+ --max-instatement-indent=120 \
+ --pad-oper \
+ --align-pointer=name \
+ --align-reference=name \
+ --break-closing-brackets \
+ --keep-one-line-statements \
+ --max-code-length=200 \
+ --mode=c \
+ --lineend=linux \
+ --indent-preprocessor \
+ "
+
+DIRECTORIES="./arm_compute ./src ./examples ./tests ./utils"
+
+if [ $# -eq 0 ]
+then
+ files=$(find $DIRECTORIES -type f \( -name \*.cpp -o -iname \*.h -o -name \*.inl -o -name \*.cl \))
+else
+ files=$@
+fi
+for f in $files
+do
+ sed -i 's/\t/ /g' $f
+ clang-format -i -style=file $f
+ astyle -n -q $ASTYLE_PARAMETERS $f
+done
diff --git a/scripts/format_doxygen.py b/scripts/format_doxygen.py
new file mode 100755
index 0000000000..96adf52726
--- /dev/null
+++ b/scripts/format_doxygen.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+import os.path
+import re
+import sys
+
+def process_comment(fd, comment, first_param, last_param):
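+    """Write one comment block to fd, re-aligning its Doxygen @param rows.
+
+    comment is the list of raw comment lines; first_param and last_param
+    delimit the @param block within it (first_param < 0 means the comment
+    contains no parameters and is copied through unchanged).
+    """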
+ if first_param < 0:
+ # Nothing to do: just copy the comment
+ fd.write("".join(comment))
+ else:
+ params = list()
+
+ # Copy the non param lines unmodified:
+ fd.write("".join(comment[:first_param]))
+
+ # Measure the indentation of the first param and use that to create an empty comment line string:
+ m = re.match(r" *\*", comment[first_param])
+
+ if not m:
+ raise Exception("Not a comment ? '{}'".format(comment[first_param]))
+
+ empty_line = "{}\n".format(m.group(0))
+
+ # For each param split the line into 3 columns: param, param_name, description
+ for param in range(first_param, last_param):
+ m = re.match(r"([^@]+@param\[[^\]]+\]) +(\S+) +(.+)", comment[param])
+
+ if m:
+ params.append( (m.group(1), m.group(2), m.group(3)) )
+ else:
+ # If it's not a match then it must be a multi-line param description:
+
+ m = re.match("( *\*) +(.*)", comment[param])
+
+ if not m:
+ raise Exception("Not a comment line ? ({})".format(n_line - len(comment) + param))
+
+ params.append( (m.group(1), "", m.group(2)) )
+
+ # Now that we've got a list of params, find what is the longest string for each column:
+ max_len = [0, 0]
+
+ for p in params:
+ for l in range(len(max_len)):
+ max_len[l] = max(max_len[l], len(p[l]))
+
+ # Insert an empty line if needed before the first param to make it easier to read:
+ m = re.match(r" *\* *$", comment[first_param - 1])
+
+ if not m:
+ # insert empty line
+ fd.write(empty_line)
+
+ # Write out the formatted list of params:
+ for p in params:
+ fd.write("{}{} {}{} {}\n".format(
+ p[0], " " * (max_len[0] - len(p[0])),
+ p[1], " " * (max_len[1] - len(p[1])),
+ p[2]))
+
+ # If the next line after the list of params is a command (@return, @note, @warning, etc), insert an empty line to separate it from the list of params
+ if last_param < len(comment) - 1:
+ if re.match(r" *\* *@\w+", comment[last_param]):
+ # insert empty line
+ fd.write(empty_line)
+
+    # Copy the remainder of the comment unmodified:
+ fd.write("".join(comment[last_param:]))
+
+if __name__ == "__main__":
+ n_file=0
+
+ if len(sys.argv) == 1:
+ paths = []
+
+ for top_level in ["./arm_compute", "./src", "./examples", "./tests", "./utils"]:
+ for root, _, files in os.walk(top_level):
+ paths.extend([os.path.join(root, f) for f in files])
+ else:
+ paths = sys.argv[1:]
+
+ for path in paths:
+        if os.path.splitext(path)[1] not in (".cpp", ".inl", ".cl", ".h"):
+ continue
+
+ print("[{}] {}".format(n_file, path))
+
+ n_file += 1
+
+ with open(path,'r+', encoding="utf-8") as fd:
+ comment = list()
+ first_param = -1
+ last_param = -1
+ n_line = 0
+
+ lines = fd.readlines()
+ fd.seek(0)
+ fd.truncate()
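+
+            # Stream the file back out line by line, buffering each comment
+            # block so that process_comment() can re-align its @param rows
+            # before it is written.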
+
+ for line in lines:
+ n_line += 1
+
+ # Start comment
+ # Match C-style comment /* anywhere in the line
+ if re.search(r"/\*", line):
+ #print("Start comment {}".format(n_line))
+
+ if len(comment) > 0:
+ raise Exception("Already in a comment! ({})".format(n_line))
+
+ comment.append(line)
+
+ # Comment already started
+ elif len(comment) > 0:
+ #print("Add line to comment {}".format(n_line))
+
+ comment.append(line)
+
+ # Non-comment line
+ else:
+ #print("Normal line {}".format(n_line))
+
+ fd.write(line)
+
+ # Match param declaration in Doxygen comment
+ # @param[in] name description
+ if re.search(r"@param\[[^\]]+\] +\S+ +\S", line):
+ #print("Param {}".format(n_line))
+
+ if first_param < 0:
+ first_param = len(comment) - 1
+
+ last_param = len(comment)
+
+ # Match end of C-style comment */
+ if re.search(r"\*/", line):
+ #print("End comment {}".format(n_line))
+
+ if len(comment) < 1:
+ raise Exception("Was not in a comment! ({})".format(n_line))
+
+ #print("Process comment {} {}".format(first_param, last_param))
+
+ process_comment(fd, comment, first_param, last_param)
+
+ comment = list()
+ first_param = -1
+ last_param = -1
diff --git a/scripts/include_functions_kernels.py b/scripts/include_functions_kernels.py
new file mode 100755
index 0000000000..9f566cbc1a
--- /dev/null
+++ b/scripts/include_functions_kernels.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3.5
+
+import glob
+import collections
+import os
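+
+# Regenerate the umbrella headers (e.g. CLKernels.h / CLFunctions.h) so that
+# they include every kernel/function header currently present on disk.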
+
+Target = collections.namedtuple('Target', 'name prefix')
+
+targets = [Target("NEON", "NE"), Target("CL", "CL"), Target("CPP", "CPP")]
+
+armcv_path = "arm_compute"
+core_path = armcv_path + "/core/"
+runtime_path = armcv_path + "/runtime/"
+include_str = "#include \""
+
+
+def read_file(file):
+ with open(file, "r") as f:
+ lines = f.readlines()
+ return lines
+
+
+def write_file(file, lines):
+ with open(file, "w") as f:
+ for line in lines:
+ f.write(line)
+
+
+def remove_existing_includes(lines):
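+    # Assumes the file already contains at least one '#include "' line;
+    # next() would raise StopIteration otherwise.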
+ first_pos = next(i for i, line in enumerate(lines) if include_str in line)
+ return [x for x in lines if not x.startswith(include_str)], first_pos
+
+
+def add_updated_includes(lines, pos, includes):
+ lines[pos:pos] = includes
+ return lines
+
+
+def create_include_list(folder):
+ files_path = folder + "/*.h"
+ files = glob.glob(files_path)
+    updated_files = [include_str + folder + "/" + os.path.basename(x) + "\"\n" for x in files]
+ updated_files.sort()
+ return updated_files
+
+
+def include_components(path, header_prefix, folder):
+ for t in targets:
+ target_path = path + t.name + "/"
+ components_file = target_path + t.prefix + header_prefix
+ if os.path.exists(components_file):
+ include_list = create_include_list(target_path + folder)
+ lines = read_file(components_file)
+ lines, first_pos = remove_existing_includes(lines)
+ lines = add_updated_includes(lines, first_pos, include_list)
+ write_file(components_file, lines)
+
+
+if __name__ == "__main__":
+ # Include kernels
+ include_components(core_path, "Kernels.h", "kernels")
+
+ # Include functions
+ include_components(runtime_path, "Functions.h", "functions")
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
new file mode 100644
index 0000000000..b75ebcfeb8
--- /dev/null
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/AccessWindowAutoPadding.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
+ : _info(info)
+{
+}
+
+ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ ARM_COMPUTE_UNUSED(window);
+ ARM_COMPUTE_UNUSED(input_valid_region);
+ ARM_COMPUTE_UNUSED(border_undefined);
+ ARM_COMPUTE_UNUSED(border_size);
+
+ return compute_valid_region();
+}
+
+ValidRegion AccessWindowAutoPadding::compute_valid_region() const
+{
+ if(_info == nullptr)
+ {
+ return ValidRegion();
+ }
+
+ return ValidRegion(Coordinates(), _info->tensor_shape());
+}
+
+void AccessWindowAutoPadding::set_valid_region()
+{
+ if(_info == nullptr)
+ {
+ return;
+ }
+
+ _info->set_valid_region(compute_valid_region());
+}
+
+bool AccessWindowAutoPadding::update_window_if_needed(Window &window) const
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ return false;
+}
+
+bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) const
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ // Update strides in tensor info
+ return _info->auto_padding();
+}
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
new file mode 100644
index 0000000000..8b6419c485
--- /dev/null
+++ b/src/core/AccessWindowStatic.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/AccessWindowStatic.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+AccessWindowStatic::AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y)
+ : _info(info), _start_x(start_x), _start_y(start_y), _end_x(end_x), _end_y(end_y)
+{
+}
+
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ ARM_COMPUTE_UNUSED(border_undefined);
+ ARM_COMPUTE_UNUSED(border_size);
+
+ return compute_valid_region(window, input_valid_region);
+}
+
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
+{
+ if(_info == nullptr)
+ {
+ return input_valid_region;
+ }
+
+ Coordinates &anchor = input_valid_region.anchor;
+ TensorShape &shape = input_valid_region.shape;
+
+ // Start of the valid region is equal to the start of the static access but
+ // never outside of the tensor.
+ anchor.set(0, std::max<int>(0, _start_x));
+ if(_info->num_dimensions() > 1)
+ {
+ anchor.set(1, std::max<int>(0, _start_y));
+ }
+
+ // End of the valid region is equal to the end of the static access but
+ // never outside of the tensor.
+ shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
+ if(_info->num_dimensions() > 1)
+ {
+ shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
+ }
+
+ // For higher dimension use the intersection of the window size and the
+ // valid region of the input
+ for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ {
+ anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
+ shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
+ }
+
+ return input_valid_region;
+}
+
+void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region)
+{
+ if(_info != nullptr)
+ {
+ _info->set_valid_region(compute_valid_region(window, input_valid_region));
+ }
+}
+
+bool AccessWindowStatic::update_window_if_needed(Window &window) const
+{
+ // Only update the window size if we can't use padding
+ if(_info == nullptr || _info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+ const Strides &strides = _info->strides_in_bytes();
+ const size_t offset_first_element = _info->offset_first_element_in_bytes();
+
+ bool window_modified = false;
+
+ int front_pad_y = 0;
+
+ // Adjust window start for Y dimension
+ if(_start_y < 0)
+ {
+ // Calculate rows available above the tensor
+ const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
+
+ if(_start_y < front_pad_y_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(_start_y, front_pad_y_available, window.y().step());
+
+ window.set(1, Window::Dimension(start, window.y().end(), window.y().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_y = std::max(0, -window.y().start());
+ }
+
+ // Adjust window end for Y dimension
+ if(_end_y > static_cast<int>(shape[1]))
+ {
+ const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
+
+ // Calculate rows available below the tensor
+ const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
+
+ if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(_end_y, shape[1] + tail_pad_y_available, window.y().step()) + window.y().step();
+ window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
+ window_modified = true;
+ }
+ }
+
+ int front_pad_x = 0;
+
+ const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
+
+ // Adjust window start for X dimension
+ if(_start_x < 0)
+ {
+ const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+
+ if(_start_x < front_pad_x_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(_start_x, front_pad_x_available, window.x().step());
+ window.set(0, Window::Dimension(start, window.x().end(), window.x().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_x = std::max(0, -window.x().start());
+ }
+
+ // Adjust window end for X dimension
+ if(_end_x > static_cast<int>(shape[0]))
+ {
+ const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
+
+ if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(_end_x, shape[0] + tail_pad_x_available, window.x().step()) + window.x().step();
+ window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
+ window_modified = true;
+ }
+ }
+
+ window.validate();
+
+ return window_modified;
+}
+
+bool AccessWindowStatic::update_padding_if_needed(const Window &window) const
+{
+ ARM_COMPUTE_UNUSED(window);
+
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+
+ PaddingSize padding;
+ padding.left = std::max(0, -_start_x);
+ padding.right = std::max<int>(0, _end_x - shape[0]);
+ padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -_start_y);
+ padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, _end_y - shape[1]);
+
+ // Update strides in tensor info
+ return _info->extend_padding(padding);
+}
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
new file mode 100644
index 0000000000..b3605c43f7
--- /dev/null
+++ b/src/core/AccessWindowTranspose.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/AccessWindowTranspose.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ if(_info == nullptr)
+ {
+ return input_valid_region;
+ }
+
+ Coordinates &anchor = input_valid_region.anchor;
+ TensorShape &shape = input_valid_region.shape;
+ Coordinates old_anchor(anchor);
+ TensorShape old_shape(shape);
+
+ if(!border_undefined)
+ {
+ border_size = BorderSize(0);
+ }
+
+ // Start of the valid region is equal to the start of the window. But it
+ // cannot be less than the start of the input's valid region plus the border
+ // size required by this kernel (if undefined).
+ // Additionally the valid region is shifted by the offset that is used by
+ // the kernel to write back output values.
+ // As the relation between input and output is transposed window.y() is
+ // used for x anchor and window.x() for y anchor.
+ anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+ anchor.set(1, std::max<int>(window.x().start() * _scale_y, anchor[0] + border_size.left) + _y);
+
+ // End of the valid region is equal to the start of the last write of the
+ // kernel plus the number of written elements. (This assumes that all
+ // written elements are valid). Nevertheless the end cannot be larger than
+ // the end of the input's valid region minus the border size.
+    // Note: the size of the region is stored rather than its end points. Thus
+    // the old size is first converted into end points so that it can be
+    // compared against the execution window. Afterwards the new end points
+    // are converted back into a size of the region.
+ // As the relation between input and output is transposed window.y() is
+ // used for x shape and window.x() for y shape.
+ shape.set(0, std::min<int>(old_anchor[1] + old_shape[1] - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+ shape.set(1, std::min<int>(old_anchor[0] + old_shape[0] - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+
+ // For higher dimensions use the intersection of the window size and the
+ // valid region of the input
+ for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ {
+ anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
+ shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
+ }
+
+ return input_valid_region;
+}
+
+bool AccessWindowTranspose::update_window_if_needed(Window &window) const
+{
+ // Only update the window size if we can't use padding
+ if(_info == nullptr || _info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+ const Strides &strides = _info->strides_in_bytes();
+ const size_t offset_first_element = _info->offset_first_element_in_bytes();
+
+ bool window_modified = false;
+
+ int front_pad_y = 0;
+
+ // Transpose and scale
+ const int min_y = window.x().start() * _scale_y + _y;
+ const int max_y = window.x().end() * _scale_y + _y;
+
+ // Adjust window start for output's Y dimension (so X in (input) window)
+ if(min_y < 0)
+ {
+ // Calculate rows available above the tensor
+        const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
+
+ if(min_y < front_pad_y_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y;
+
+ window.set(0, Window::Dimension(start / _scale_y, window.x().end(), window.x().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_y = std::max(0, static_cast<int>(std::floor(-window.x().start() * _scale_y)) - _y);
+ }
+
+ // Adjust window end for Y dimension
+ if(max_y > static_cast<int>(shape[1]))
+ {
+ const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
+
+ // Calculate rows available below the tensor
+ const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
+
+ if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height;
+ window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step()));
+ window_modified = true;
+ }
+ }
+
+ int front_pad_x = 0;
+
+ // Transpose and scale
+ const int min_x = window.y().start() * _scale_x + _x;
+ const int max_x = window.y().end() * _scale_x + _x;
+
+ const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
+
+ // Adjust window start for X dimension
+ if(min_x < 0)
+ {
+ const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+
+ if(min_x < front_pad_x_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x;
+ window.set(1, Window::Dimension(start / _scale_x, window.y().end(), window.y().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_x = std::max(0, static_cast<int>(std::floor(-window.y().start() * _scale_x)) - _x);
+ }
+
+ // Adjust window end for X dimension
+ if(max_x > static_cast<int>(shape[0]))
+ {
+ const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
+
+ if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width;
+ window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step()));
+ window_modified = true;
+ }
+ }
+
+ window.validate();
+
+ return window_modified;
+}
+
+bool AccessWindowTranspose::update_padding_if_needed(const Window &window) const
+{
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ ARM_COMPUTE_ERROR_ON(window.y().step() == 0);
+ ARM_COMPUTE_ERROR_ON(window.x().step() == 0);
+
+ const int min_x = window.y().start() * _scale_x + _x;
+ const int max_x = window.y().end() * _scale_x + _x;
+ const int min_y = window.x().start() * _scale_y + _y;
+ const int max_y = window.x().end() * _scale_y + _y;
+
+ const TensorShape &shape = _info->tensor_shape();
+
+ PaddingSize padding;
+ padding.left = std::max(0, -min_x);
+ padding.right = std::max<int>(0, max_x - shape[0]);
+ padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -min_y);
+ padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, max_y - shape[1]);
+
+ // Update strides in tensor info
+ return _info->extend_padding(padding);
+}
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
new file mode 100644
index 0000000000..21b72ddd3b
--- /dev/null
+++ b/src/core/CL/CLHelpers.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <map>
+#include <vector>
+
+namespace
+{
+arm_compute::GPUTarget get_bifrost_target(const std::string &name)
+{
+ arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+ if(name == "G7")
+ {
+ target = arm_compute::GPUTarget::G70;
+ }
+
+ return target;
+}
+
+arm_compute::GPUTarget get_midgard_target(const std::string &name)
+{
+ arm_compute::GPUTarget target = arm_compute::GPUTarget::MIDGARD;
+
+ if(name == "T6")
+ {
+ target = arm_compute::GPUTarget::T600;
+ }
+ else if(name == "T7")
+ {
+ target = arm_compute::GPUTarget::T700;
+ }
+ else if(name == "T8")
+ {
+ target = arm_compute::GPUTarget::T800;
+ }
+
+ return target;
+}
+} // namespace
+
+namespace arm_compute
+{
+std::string get_cl_type_from_data_type(const DataType &dt)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ return "uchar";
+ case DataType::S8:
+ return "char";
+ case DataType::U16:
+ return "ushort";
+ case DataType::S16:
+ return "short";
+ case DataType::U32:
+ return "uint";
+ case DataType::S32:
+ return "int";
+ case DataType::U64:
+ return "ulong";
+ case DataType::S64:
+ return "long";
+ case DataType::F16:
+ return "half";
+ case DataType::F32:
+ return "float";
+ default:
+ ARM_COMPUTE_ERROR("Unsupported input data type.");
+ return "";
+ }
+}
+
+const std::string &string_from_target(GPUTarget target)
+{
+ static std::map<GPUTarget, const std::string> gpu_target_map =
+ {
+ { GPUTarget::MIDGARD, "midgard" },
+ { GPUTarget::BIFROST, "bifrost" },
+ { GPUTarget::T600, "t600" },
+ { GPUTarget::T700, "t700" },
+ { GPUTarget::T800, "t800" },
+ { GPUTarget::G70, "g70" }
+ };
+
+ return gpu_target_map[target];
+}
+
+GPUTarget get_target_from_device(cl::Device &device)
+{
+ const std::string name_mali("Mali-");
+ GPUTarget target{ GPUTarget::MIDGARD };
+
+ size_t name_size = 0;
+ std::vector<char> name;
+
+ // Query device name size
+ cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
+ ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
+ // Resize vector
+ name.resize(name_size);
+ // Query device name
+ err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name.data(), nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
+ ARM_COMPUTE_UNUSED(err);
+
+ std::string name_str(name.begin(), name.end());
+ auto pos = name_str.find(name_mali);
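+
+    // Device names are expected to look like "Mali-T760" or "Mali-G71": the
+    // two characters following "Mali-" are enough to select the architecture
+    // family below.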
+
+ if(pos != std::string::npos)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG((pos + name_mali.size() + 2) > name_str.size(), "Device name is shorter than expected.");
+ std::string sub_name = name_str.substr(pos + name_mali.size(), 2);
+
+ if(sub_name[0] == 'G')
+ {
+ target = get_bifrost_target(sub_name);
+ }
+ else if(sub_name[0] == 'T')
+ {
+ target = get_midgard_target(sub_name);
+ }
+ else
+ {
+ ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one.");
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to the default one.");
+ }
+
+ return target;
+}
+
+GPUTarget get_arch_from_target(GPUTarget target)
+{
+ return (target & GPUTarget::GPU_ARCH_MASK);
+}
+} // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
new file mode 100644
index 0000000000..15a5d90835
--- /dev/null
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+Program::Program()
+ : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
+{
+}
+
+Program::Program(cl::Context context, std::string name, std::string source)
+ : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary()
+{
+}
+
+Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary)
+ : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary))
+{
+}
+
+Program::operator cl::Program() const
+{
+ if(_is_binary)
+ {
+ return cl::Program(_context, { _device }, { _binary });
+ }
+ else
+ {
+ return cl::Program(_context, _source, false);
+ }
+}
+
+bool Program::build(const cl::Program &program, const std::string &build_options)
+{
+ try
+ {
+ return program.build(build_options.c_str()) == CL_SUCCESS;
+ }
+ catch(const cl::Error &e)
+ {
+ cl_int err = CL_SUCCESS;
+ const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err);
+
+ for(auto &pair : build_info)
+ {
+ std::cerr << pair.second << std::endl;
+ }
+
+ return false;
+ }
+}
+
+cl::Program Program::build(const std::string &build_options) const
+{
+ cl::Program cl_program = static_cast<cl::Program>(*this);
+ build(cl_program, build_options);
+ return cl_program;
+}
+
+Kernel::Kernel()
+ : _name(), _kernel()
+{
+}
+
+Kernel::Kernel(std::string name, const cl::Program &program)
+ : _name(std::move(name)),
+ _kernel(cl::Kernel(program, _name.c_str()))
+{
+}
+
+const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
+{
+ { "absdiff", "absdiff.cl" },
+ { "accumulate", "accumulate.cl" },
+ { "accumulate_squared", "accumulate.cl" },
+ { "accumulate_weighted", "accumulate.cl" },
+ { "activation_layer", "activation_layer.cl" },
+ { "arithmetic_add", "arithmetic_op.cl" },
+ { "arithmetic_sub", "arithmetic_op.cl" },
+ { "bitwise_or", "bitwise_op.cl" },
+ { "bitwise_and", "bitwise_op.cl" },
+ { "bitwise_xor", "bitwise_op.cl" },
+ { "bitwise_not", "bitwise_op.cl" },
+ { "channel_combine_NV", "channel_combine.cl" },
+ { "channel_combine_RGB888", "channel_combine.cl" },
+ { "channel_combine_RGBA8888", "channel_combine.cl" },
+ { "channel_combine_UYVY422", "channel_combine.cl" },
+ { "channel_combine_YUYV422", "channel_combine.cl" },
+ { "channel_extract_NV12", "channel_extract.cl" },
+ { "channel_extract_NV21", "channel_extract.cl" },
+ { "channel_extract_RGB888", "channel_extract.cl" },
+ { "channel_extract_RGBA8888", "channel_extract.cl" },
+ { "channel_extract_UYVY422", "channel_extract.cl" },
+ { "channel_extract_YUYV422", "channel_extract.cl" },
+ { "combine_gradients_L1", "canny.cl" },
+ { "combine_gradients_L2", "canny.cl" },
+ { "concatenate_depth", "concatenate.cl" },
+ { "convolution_rectangle", "convolution_rectangle.cl" },
+ { "col2im", "convolution_layer.cl" },
+ { "convolution3x3_static", "convolution3x3.cl" },
+ { "convolution5x5_static", "convolution5x5.cl" },
+ { "convolution7x7_static", "convolution7x7.cl" },
+ { "convolution9x9_static", "convolution9x9.cl" },
+ { "convolution_separable1x5_static", "convolution5x5.cl" },
+ { "convolution_separable5x1_static", "convolution5x5.cl" },
+ { "convolution_separable1x7_static", "convolution7x7.cl" },
+ { "convolution_separable7x1_static", "convolution7x7.cl" },
+ { "convolution_separable1x9_static", "convolution9x9.cl" },
+ { "convolution_separable9x1_static", "convolution9x9.cl" },
+ { "convert_depth_down", "depth_convert.cl" },
+ { "convert_depth_up", "depth_convert.cl" },
+ { "copy_plane", "channel_extract.cl" },
+ { "copy_planes_3p", "channel_combine.cl" },
+ { "copy_to_keypoint", "fast_corners.cl" },
+ { "derivative", "derivative.cl" },
+ { "dilate", "dilate.cl" },
+ { "erode", "erode.cl" },
+ { "fast_corners", "fast_corners.cl" },
+ { "fill_image_borders_constant", "fill_border.cl" },
+ { "fill_image_borders_replicate", "fill_border.cl" },
+ { "finalize", "optical_flow_pyramid_lk.cl" },
+ { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
+ { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
+ { "gemm_accumulate_biases_f16", "gemm.cl" },
+ { "gemm_accumulate_biases_f32", "gemm.cl" },
+ { "gemm_interleave4x4_8bit", "gemm.cl" },
+ { "gemm_interleave4x4_16bit", "gemm.cl" },
+ { "gemm_interleave4x4_32bit", "gemm.cl" },
+ { "gemm_ma_f16", "gemm.cl" },
+ { "gemm_ma_f32", "gemm.cl" },
+ { "gemm_mm_u8", "gemm.cl" },
+ { "gemm_mm_f16", "gemm.cl" },
+ { "gemm_mm_f32_midgard", "gemm.cl" },
+ { "gemm_mm_f32_bifrost", "gemm.cl" },
+ { "gemm_vm_f16", "gemm.cl" },
+ { "gemm_vm_f32", "gemm.cl" },
+ { "gemm_lc_vm_f32", "gemm.cl" },
+ { "gemm_transpose1x16_u8", "gemm.cl" },
+ { "gemm_transpose1x8_f16", "gemm.cl" },
+ { "gemm_transpose1x4_f32", "gemm.cl" },
+ { "harris_score_3x3", "harris_corners.cl" },
+ { "harris_score_5x5", "harris_corners.cl" },
+ { "harris_score_7x7", "harris_corners.cl" },
+ { "hist_border_kernel", "histogram.cl" },
+ { "hist_border_kernel_fixed", "histogram.cl" },
+ { "hist_local_kernel", "histogram.cl" },
+ { "hist_local_kernel_fixed", "histogram.cl" },
+ { "hog_block_normalization", "hog.cl" },
+ { "hog_detector", "hog.cl" },
+ { "hog_orientation_binning", "hog.cl" },
+ { "hysteresis", "canny.cl" },
+ { "im2col_generic", "convolution_layer.cl" },
+ { "im2col_reduced", "convolution_layer.cl" },
+ { "init_level", "optical_flow_pyramid_lk.cl" },
+ { "init_level_max", "optical_flow_pyramid_lk.cl" },
+ { "init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl" },
+ { "integral_horizontal", "integral_image.cl" },
+ { "integral_vertical", "integral_image.cl" },
+ { "IYUV_to_NV12_bt709", "color_convert.cl" },
+ { "IYUV_to_RGB888_bt709", "color_convert.cl" },
+ { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
+ { "IYUV_to_YUV444_bt709", "color_convert.cl" },
+ { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
+ { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
+ { "magnitude_phase", "magnitude_phase.cl" },
+ { "mean_stddev_accumulate", "mean_stddev.cl" },
+ { "minmax", "minmaxloc.cl" },
+ { "minmax_border", "minmaxloc.cl" },
+ { "minmaxloc", "minmaxloc.cl" },
+ { "non_linear_filter_box3x3", "non_linear_filter3x3.cl" },
+ { "non_linear_filter_cross3x3", "non_linear_filter3x3.cl" },
+ { "non_linear_filter_disk3x3", "non_linear_filter3x3.cl" },
+ { "non_linear_filter_box5x5", "non_linear_filter5x5.cl" },
+ { "non_linear_filter_cross5x5", "non_linear_filter5x5.cl" },
+ { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
+ { "non_max_suppression", "nonmax.cl" },
+ { "normalization_layer_cross_map", "normalization_layer.cl" },
+ { "normalization_layer_in_map_1D", "normalization_layer.cl" },
+ { "batchnormalization_layer", "batchnormalization_layer.cl" },
+ { "NV12_to_IYUV_bt709", "color_convert.cl" },
+ { "NV12_to_RGB888_bt709", "color_convert.cl" },
+ { "NV12_to_RGBA8888_bt709", "color_convert.cl" },
+ { "NV12_to_YUV444_bt709", "color_convert.cl" },
+ { "NV21_to_IYUV_bt709", "color_convert.cl" },
+ { "NV21_to_RGB888_bt709", "color_convert.cl" },
+ { "NV21_to_RGBA8888_bt709", "color_convert.cl" },
+ { "NV21_to_YUV444_bt709", "color_convert.cl" },
+ { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
+ { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
+ { "pooling_layer_2", "pooling_layer.cl" },
+ { "pooling_layer_3", "pooling_layer.cl" },
+ { "remap_nearest_neighbour", "remap.cl" },
+ { "remap_bilinear", "remap.cl" },
+ { "reshape_to_columns", "convolution_layer.cl" },
+ { "RGB888_to_IYUV_bt709", "color_convert.cl" },
+ { "RGB888_to_NV12_bt709", "color_convert.cl" },
+ { "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
+ { "RGB888_to_YUV444_bt709", "color_convert.cl" },
+ { "RGBA8888_to_IYUV_bt709", "color_convert.cl" },
+ { "RGBA8888_to_NV12_bt709", "color_convert.cl" },
+ { "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
+ { "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
+ { "scale_nearest_neighbour", "scale.cl" },
+ { "scale_bilinear", "scale.cl" },
+ { "scharr3x3", "scharr_filter.cl" },
+ { "sobel3x3", "sobel_filter.cl" },
+ { "sobel_separable5x1", "sobel_filter.cl" },
+ { "sobel_separable1x5", "sobel_filter.cl" },
+ { "sobel_separable7x1", "sobel_filter.cl" },
+ { "sobel_separable1x7", "sobel_filter.cl" },
+ { "softmax_layer_max", "softmax_layer.cl" },
+ { "softmax_layer_shift_exp_sum", "softmax_layer.cl" },
+ { "softmax_layer_norm", "softmax_layer.cl" },
+ { "suppress_non_maximum", "canny.cl" },
+ { "tablelookup_U8", "tablelookup.cl" },
+ { "tablelookup_S16", "tablelookup.cl" },
+ { "threshold_binary", "threshold.cl" },
+ { "threshold_range", "threshold.cl" },
+ { "transpose", "transpose.cl" },
+ { "UYVY422_to_IYUV_bt709", "color_convert.cl" },
+ { "UYVY422_to_NV12_bt709", "color_convert.cl" },
+ { "UYVY422_to_RGB888_bt709", "color_convert.cl" },
+ { "UYVY422_to_RGBA8888_bt709", "color_convert.cl" },
+ { "warp_affine_nearest_neighbour", "warp_affine.cl" },
+ { "warp_affine_bilinear", "warp_affine.cl" },
+ { "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
+ { "warp_perspective_bilinear", "warp_perspective.cl" },
+ { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
+ { "YUYV422_to_NV12_bt709", "color_convert.cl" },
+ { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
+ { "YUYV422_to_RGBA8888_bt709", "color_convert.cl" },
+};
+
+const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
+{
+#ifdef EMBEDDED_KERNELS
+ {
+ "absdiff.cl",
+#include "./cl_kernels/absdiff.clembed"
+ },
+ {
+ "accumulate.cl",
+#include "./cl_kernels/accumulate.clembed"
+ },
+ {
+ "activation_layer.cl",
+#include "./cl_kernels/activation_layer.clembed"
+ },
+ {
+ "arithmetic_op.cl",
+#include "./cl_kernels/arithmetic_op.clembed"
+ },
+ {
+ "bitwise_op.cl",
+#include "./cl_kernels/bitwise_op.clembed"
+ },
+ {
+ "canny.cl",
+#include "./cl_kernels/canny.clembed"
+ },
+ {
+ "channel_combine.cl",
+#include "./cl_kernels/channel_combine.clembed"
+ },
+ {
+ "channel_extract.cl",
+#include "./cl_kernels/channel_extract.clembed"
+ },
+ {
+ "concatenate.cl",
+#include "./cl_kernels/concatenate.clembed"
+ },
+ {
+ "color_convert.cl",
+#include "./cl_kernels/color_convert.clembed"
+ },
+ {
+ "convolution3x3.cl",
+#include "./cl_kernels/convolution3x3.clembed"
+ },
+ {
+ "convolution5x5.cl",
+#include "./cl_kernels/convolution5x5.clembed"
+ },
+ {
+ "convolution7x7.cl",
+#include "./cl_kernels/convolution7x7.clembed"
+ },
+ {
+ "convolution9x9.cl",
+#include "./cl_kernels/convolution9x9.clembed"
+ },
+ {
+ "convolution_layer.cl",
+#include "./cl_kernels/convolution_layer.clembed"
+ },
+ {
+ "convolution_rectangle.cl",
+#include "./cl_kernels/convolution_rectangle.clembed"
+ },
+ {
+ "depth_convert.cl",
+#include "./cl_kernels/depth_convert.clembed"
+ },
+ {
+ "derivative.cl",
+#include "./cl_kernels/derivative.clembed"
+ },
+ {
+ "dilate.cl",
+#include "./cl_kernels/dilate.clembed"
+ },
+ {
+ "erode.cl",
+#include "./cl_kernels/erode.clembed"
+ },
+ {
+ "fast_corners.cl",
+#include "./cl_kernels/fast_corners.clembed"
+ },
+ {
+ "fill_border.cl",
+#include "./cl_kernels/fill_border.clembed"
+ },
+ {
+ "gaussian_pyramid.cl",
+#include "./cl_kernels/gaussian_pyramid.clembed"
+ },
+ {
+ "gemm.cl",
+#include "./cl_kernels/gemm.clembed"
+ },
+ {
+ "harris_corners.cl",
+#include "./cl_kernels/harris_corners.clembed"
+ },
+ {
+ "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+ },
+ {
+ "histogram.cl",
+#include "./cl_kernels/histogram.clembed"
+ },
+ {
+ "hog.cl",
+#include "./cl_kernels/hog.clembed"
+ },
+ {
+ "integral_image.cl",
+#include "./cl_kernels/integral_image.clembed"
+ },
+ {
+ "magnitude_phase.cl",
+#include "./cl_kernels/magnitude_phase.clembed"
+ },
+ {
+ "mean_stddev.cl",
+#include "./cl_kernels/mean_stddev.clembed"
+ },
+ {
+ "minmaxloc.cl",
+#include "./cl_kernels/minmaxloc.clembed"
+ },
+ {
+ "non_linear_filter3x3.cl",
+#include "./cl_kernels/non_linear_filter3x3.clembed"
+ },
+ {
+ "non_linear_filter5x5.cl",
+#include "./cl_kernels/non_linear_filter5x5.clembed"
+ },
+ {
+ "non_linear_filter_helpers.h",
+#include "./cl_kernels/non_linear_filter_helpers.hembed"
+ },
+ {
+ "nonmax.cl",
+#include "./cl_kernels/nonmax.clembed"
+ },
+ {
+ "normalization_layer.cl",
+#include "./cl_kernels/normalization_layer.clembed"
+ },
+ {
+ "batchnormalization_layer.cl",
+#include "./cl_kernels/batchnormalization_layer.clembed"
+ },
+ {
+ "optical_flow_pyramid_lk.cl",
+#include "./cl_kernels/optical_flow_pyramid_lk.clembed"
+ },
+ {
+ "pixelwise_mul_float.cl",
+#include "./cl_kernels/pixelwise_mul_float.clembed"
+ },
+ {
+ "pixelwise_mul_int.cl",
+#include "./cl_kernels/pixelwise_mul_int.clembed"
+ },
+ {
+ "pooling_layer.cl",
+#include "./cl_kernels/pooling_layer.clembed"
+ },
+ {
+ "remap.cl",
+#include "./cl_kernels/remap.clembed"
+ },
+ {
+ "scale.cl",
+#include "./cl_kernels/scale.clembed"
+ },
+ {
+ "scharr_filter.cl",
+#include "./cl_kernels/scharr_filter.clembed"
+ },
+ {
+ "sobel_filter.cl",
+#include "./cl_kernels/sobel_filter.clembed"
+ },
+ {
+ "softmax_layer.cl",
+#include "./cl_kernels/softmax_layer.clembed"
+ },
+ {
+ "tablelookup.cl",
+#include "./cl_kernels/tablelookup.clembed"
+ },
+ {
+ "threshold.cl",
+#include "./cl_kernels/threshold.clembed"
+ },
+ {
+ "transpose.cl",
+#include "./cl_kernels/transpose.clembed"
+ },
+ {
+ "types.h",
+#include "./cl_kernels/types.hembed"
+ },
+ {
+ "warp_affine.cl",
+#include "./cl_kernels/warp_affine.clembed"
+ },
+ {
+ "warp_helpers.h",
+#include "./cl_kernels/warp_helpers.hembed"
+ },
+ {
+ "warp_perspective.cl",
+#include "./cl_kernels/warp_perspective.clembed"
+ }
+#endif
+};
+
+CLKernelLibrary::CLKernelLibrary()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+{
+}
+
+CLKernelLibrary &CLKernelLibrary::get()
+{
+ static CLKernelLibrary _kernel_library;
+ return _kernel_library;
+}
+
+Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const StringSet &build_options_set) const
+{
+ // Find which program contains the kernel
+ auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+ if(_kernel_program_map.end() == kernel_program_it)
+ {
+ ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+ }
+
+ // Check if the program has been built before with same build options.
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set);
+ const std::string built_program_name = program_name + "_" + build_options;
+ auto built_program_it = _built_programs_map.find(built_program_name);
+
+ cl::Program cl_program;
+
+ if(_built_programs_map.end() != built_program_it)
+ {
+ // If program has been built, retrieve to create kernel from it
+ cl_program = built_program_it->second;
+ }
+ else
+ {
+ // Get program
+ Program program = load_program(program_name);
+
+ // Build program
+ cl_program = program.build(build_options);
+
+ // Add built program to internal map
+ _built_programs_map.emplace(built_program_name, cl_program);
+ }
+
+ // Create and return kernel
+ return Kernel(kernel_name, cl_program);
+}
+
+const Program &CLKernelLibrary::load_program(const std::string &program_name) const
+{
+ const auto program_it = _programs_map.find(program_name);
+
+ if(program_it != _programs_map.end())
+ {
+ return program_it->second;
+ }
+
+ Program program;
+
+#ifdef EMBEDDED_KERNELS
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if(_program_source_map.end() == program_source_it)
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ program = Program(_context, program_name, program_source_it->second);
+#else
+ // Check for binary
+ std::string source_name = _kernel_path + program_name;
+ std::string binary_name = source_name + "bin";
+
+ if(std::ifstream(binary_name).is_open())
+ {
+ const std::string program_binary = read_file(binary_name, true);
+ program = Program(_context, _device, program_name, std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
+ }
+ else if(std::ifstream(source_name).is_open())
+ {
+ program = Program(_context, program_name, read_file(source_name, false));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
+ }
+#endif
+
+ // Insert program to program map
+ const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+ return new_program.first->second;
+}
+
+std::string CLKernelLibrary::stringify_set(const StringSet &s) const
+{
+ std::string concat_set = "-cl-arm-non-uniform-work-group-size ";
+
+#ifndef EMBEDDED_KERNELS
+ concat_set += "-I" + _kernel_path + " ";
+#endif /* EMBEDDED_KERNELS */
+
+ // Concatenate set
+ for(const auto &el : s)
+ {
+ concat_set += " " + el;
+ }
+
+ return concat_set;
+}
diff --git a/src/core/CL/ICLDistribution1D.cpp b/src/core/CL/ICLDistribution1D.cpp
new file mode 100644
index 0000000000..a645d0ed71
--- /dev/null
+++ b/src/core/CL/ICLDistribution1D.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+ICLDistribution1D::ICLDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : IDistribution1D(num_bins, offset, range), _mapping(nullptr)
+{
+}
+
+void ICLDistribution1D::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+ _mapping = do_map(q, blocking);
+}
+
+void ICLDistribution1D::unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+uint32_t *ICLDistribution1D::buffer() const
+{
+ return _mapping;
+}
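A short sketch of the mapping contract the two assertions above enforce; dist and queue stand in for a concrete CL-backed distribution and its command queue:

    dist.map(queue, true);          // blocking map; asserts if already mapped
    uint32_t *bins = dist.buffer(); // host pointer, valid only while mapped
    // ... read or update the histogram bins ...
    dist.unmap(queue);              // asserts if not mapped; buffer() reverts to nullptr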
diff --git a/src/core/CL/ICLHOG.cpp b/src/core/CL/ICLHOG.cpp
new file mode 100644
index 0000000000..e1829971cf
--- /dev/null
+++ b/src/core/CL/ICLHOG.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLHOG.h"
+
+using namespace arm_compute;
+
+ICLHOG::ICLHOG()
+ : _mapping(nullptr)
+{
+}
+
+void ICLHOG::map(cl::CommandQueue &q, bool blocking)
+{
+ _mapping = do_map(q, blocking);
+}
+
+void ICLHOG::unmap(cl::CommandQueue &q)
+{
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+float *ICLHOG::descriptor() const
+{
+ return reinterpret_cast<float *>(_mapping);
+}
\ No newline at end of file
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
new file mode 100644
index 0000000000..7ac0fe3bbb
--- /dev/null
+++ b/src/core/CL/ICLKernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint)
+{
+ if(kernel.kernel()() == nullptr)
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start())));
+
+ cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(),
+ (window.y().end() - window.y().start()) / window.y().step(),
+ (window.z().end() - window.z().start()) / window.z().step());
+
+ cl::NDRange lws = cl::NullRange;
+
+ if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2]))
+ {
+ lws = lws_hint;
+ }
+
+ queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
+}
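Worked example: a window covering [0,256) in x with step 16, [0,128) in y with step 1 and [0,1) in z yields gws = (16, 128, 1); an lws hint of (128, 1, 1) is then discarded (128 > 16 in x) and cl::NullRange leaves the local work size to the OpenCL runtime.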
+
+ICLKernel::ICLKernel()
+ : _kernel(nullptr), _lws_hint(cl::Range_128_1), _target(CLScheduler::get().target())
+{
+}
+
+cl::Kernel &ICLKernel::kernel()
+{
+ return _kernel;
+}
+
+template <unsigned int dimension_size>
+unsigned int ICLKernel::num_arguments_per_tensor() const
+{
+ return 2 + 2 * dimension_size;
+}
+
+template <unsigned int dimension_size>
+void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const ITensorInfo *info = tensor->info();
+ const Strides &strides = info->strides_in_bytes();
+
+ // Calculate offset to the start of the window
+ unsigned int offset_first_element = info->offset_first_element_in_bytes();
+
+ for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+ {
+ offset_first_element += window[n].start() * strides[n];
+ }
+
+ unsigned int idx_start = idx;
+ _kernel.setArg(idx++, tensor->cl_buffer());
+
+ for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+ {
+ _kernel.setArg<cl_uint>(idx++, strides[dimension]);
+ _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
+ }
+
+ _kernel.setArg<cl_uint>(idx++, offset_first_element);
+
+ ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
+ "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+ ARM_COMPUTE_UNUSED(idx_start);
+}
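Concretely, a 2D tensor therefore occupies 2 + 2 * 2 = 6 consecutive kernel arguments: the cl buffer, a (stride, stride * step) pair for each of the two dimensions, and the byte offset of the first element.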
+
+void ICLKernel::add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<1>(idx, tensor, window);
+}
+
+void ICLKernel::add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<2>(idx, tensor, window);
+}
+
+void ICLKernel::add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+{
+ add_tensor_argument<3>(idx, tensor, window);
+}
+
+unsigned int ICLKernel::num_arguments_per_1D_tensor() const
+{
+ return num_arguments_per_tensor<1>();
+}
+
+unsigned int ICLKernel::num_arguments_per_2D_tensor() const
+{
+ return num_arguments_per_tensor<2>();
+}
+
+unsigned int ICLKernel::num_arguments_per_3D_tensor() const
+{
+ return num_arguments_per_tensor<3>();
+}
+
+void ICLKernel::set_target(cl::Device &device)
+{
+ _target = get_target_from_device(device);
+}
+
+void ICLKernel::set_target(GPUTarget target)
+{
+ _target = target;
+}
+
+GPUTarget ICLKernel::get_target() const
+{
+ return _target;
+}
diff --git a/src/core/CL/ICLLut.cpp b/src/core/CL/ICLLut.cpp
new file mode 100644
index 0000000000..ea9deac6dc
--- /dev/null
+++ b/src/core/CL/ICLLut.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLLut.h"
+
+using namespace arm_compute;
+
+ICLLut::ICLLut()
+ : _mapping(nullptr)
+{
+}
+
+void ICLLut::map(cl::CommandQueue &q, bool blocking)
+{
+ _mapping = do_map(q, blocking);
+}
+
+void ICLLut::unmap(cl::CommandQueue &q)
+{
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+uint8_t *ICLLut::buffer() const
+{
+ return _mapping;
+}
diff --git a/src/core/CL/ICLMultiHOG.cpp b/src/core/CL/ICLMultiHOG.cpp
new file mode 100644
index 0000000000..8ece566e83
--- /dev/null
+++ b/src/core/CL/ICLMultiHOG.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLMultiHOG.h"
+
+#include "arm_compute/core/IHOG.h"
+
+using namespace arm_compute;
+
+IHOG *ICLMultiHOG::model(size_t index)
+{
+ return cl_model(index);
+}
+
+const IHOG *ICLMultiHOG::model(size_t index) const
+{
+ return cl_model(index);
+}
\ No newline at end of file
diff --git a/src/core/CL/ICLMultiImage.cpp b/src/core/CL/ICLMultiImage.cpp
new file mode 100644
index 0000000000..dbf3fe3e6f
--- /dev/null
+++ b/src/core/CL/ICLMultiImage.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLMultiImage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensor.h"
+
+using namespace arm_compute;
+
+IImage *ICLMultiImage::plane(unsigned int index)
+{
+ return cl_plane(index);
+}
+
+const IImage *ICLMultiImage::plane(unsigned int index) const
+{
+ return cl_plane(index);
+}
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
new file mode 100644
index 0000000000..5dc3e6c8bb
--- /dev/null
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLSimple2DKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void ICLSimple2DKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
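The slice loop means a 2D kernel launched on a 3D execution window runs once per plane: first_slice_window_2D() yields the x/y sub-window of the first plane and slide_window_slice_2D() advances through the remaining ones, with the tensor arguments re-set for each slice.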
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
new file mode 100644
index 0000000000..7b0d011b3e
--- /dev/null
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void ICLSimple3DKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
new file mode 100644
index 0000000000..fec9d923da
--- /dev/null
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLSimpleKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+ICLSimpleKernel::ICLSimpleKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+{
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
+
+ ICLKernel::configure(win);
+}
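As an illustration (numbers hypothetical): a 3x3 filter kernel writing 8 pixels per work-item might call configure(input, output, 8, border_undefined, BorderSize(1)); the border size then shrinks or extends the maximal window and the output's valid region accordingly, while both tensors are accessed 8 elements at a time along x.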
diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp
new file mode 100644
index 0000000000..4a7952e108
--- /dev/null
+++ b/src/core/CL/ICLTensor.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+ICLTensor::ICLTensor()
+ : _mapping(nullptr)
+{
+}
+
+void ICLTensor::map(cl::CommandQueue &q, bool blocking)
+{
+ _mapping = do_map(q, blocking);
+}
+
+void ICLTensor::unmap(cl::CommandQueue &q)
+{
+ do_unmap(q);
+ _mapping = nullptr;
+}
+
+void ICLTensor::clear(cl::CommandQueue &q)
+{
+ this->map(q);
+ std::memset(static_cast<void *>(_mapping), 0, this->info()->total_size());
+ this->unmap(q);
+}
+
+uint8_t *ICLTensor::buffer() const
+{
+ return _mapping;
+}
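A hedged sketch of the host access pattern map()/unmap() enable; tensor and queue are placeholders for a concrete CL tensor and its command queue:

    tensor.map(queue, true);         // blocking map into host address space
    uint8_t *data = tensor.buffer(); // valid only between map() and unmap()
    // ... fill or inspect up to tensor.info()->total_size() bytes ...
    tensor.unmap(queue);             // buffer() returns nullptr afterwards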
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
new file mode 100644
index 0000000000..3b8dfd2465
--- /dev/null
+++ b/src/core/CL/OpenCL.cpp
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <dlfcn.h>
+#include <iostream>
+
+using clBuildProgram_func = cl_int (*)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program, void *), void *);
+using clEnqueueNDRangeKernel_func = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+using clSetKernelArg_func = cl_int (*)(cl_kernel, cl_uint, size_t, const void *);
+using clReleaseMemObject_func = cl_int (*)(cl_mem);
+using clEnqueueUnmapMemObject_func = cl_int (*)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
+using clRetainCommandQueue_func = cl_int (*)(cl_command_queue command_queue);
+using clReleaseContext_func = cl_int (*)(cl_context);
+using clReleaseEvent_func = cl_int (*)(cl_event);
+using clEnqueueWriteBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
+using clEnqueueReadBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
+using clGetProgramBuildInfo_func = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
+using clRetainProgram_func = cl_int (*)(cl_program program);
+using clEnqueueMapBuffer_func = void *(*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
+using clReleaseCommandQueue_func = cl_int (*)(cl_command_queue);
+using clCreateProgramWithBinary_func = cl_program (*)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
+using clRetainContext_func = cl_int (*)(cl_context context);
+using clReleaseProgram_func = cl_int (*)(cl_program program);
+using clFlush_func = cl_int (*)(cl_command_queue command_queue);
+using clGetProgramInfo_func = cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *);
+using clCreateKernel_func = cl_kernel (*)(cl_program, const char *, cl_int *);
+using clRetainKernel_func = cl_int (*)(cl_kernel kernel);
+using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
+using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
+using clReleaseKernel_func = cl_int (*)(cl_kernel kernel);
+using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *);
+using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
+
+class CLSymbols
+{
+private:
+ CLSymbols()
+ {
+ void *handle = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_LOCAL);
+ if(handle == nullptr)
+ {
+ std::cerr << "Can't load libOpenCL.so: " << dlerror() << std::endl;
+ }
+ else
+ {
+ clBuildProgram = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram"));
+ clEnqueueNDRangeKernel = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel"));
+ clSetKernelArg = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg"));
+ clReleaseKernel = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel"));
+ clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource"));
+ clCreateBuffer = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer"));
+ clRetainKernel = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel"));
+ clCreateKernel = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel"));
+ clGetProgramInfo = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo"));
+ clFlush = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush"));
+ clReleaseProgram = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram"));
+ clRetainContext = reinterpret_cast<clRetainContext_func>(dlsym(handle, "clRetainContext"));
+ clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary"));
+ clReleaseCommandQueue = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue"));
+ clEnqueueMapBuffer = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer"));
+ clRetainProgram = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram"));
+ clGetProgramBuildInfo = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo"));
+ clEnqueueReadBuffer = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer"));
+ clEnqueueWriteBuffer = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer"));
+ clReleaseEvent = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent"));
+ clReleaseContext = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext"));
+ clRetainCommandQueue = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
+ clEnqueueUnmapMemObject = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
+ clReleaseMemObject = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
+ clGetDeviceInfo = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
+ clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
+ dlclose(handle);
+ }
+ }
+
+public:
+ static CLSymbols &get()
+ {
+ static CLSymbols symbols = CLSymbols();
+ return symbols;
+ }
+
+ clBuildProgram_func clBuildProgram = nullptr;
+ clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr;
+ clSetKernelArg_func clSetKernelArg = nullptr;
+ clReleaseKernel_func clReleaseKernel = nullptr;
+ clCreateProgramWithSource_func clCreateProgramWithSource = nullptr;
+ clCreateBuffer_func clCreateBuffer = nullptr;
+ clRetainKernel_func clRetainKernel = nullptr;
+ clCreateKernel_func clCreateKernel = nullptr;
+ clGetProgramInfo_func clGetProgramInfo = nullptr;
+ clFlush_func clFlush = nullptr;
+ clReleaseProgram_func clReleaseProgram = nullptr;
+ clRetainContext_func clRetainContext = nullptr;
+ clCreateProgramWithBinary_func clCreateProgramWithBinary = nullptr;
+ clReleaseCommandQueue_func clReleaseCommandQueue = nullptr;
+ clEnqueueMapBuffer_func clEnqueueMapBuffer = nullptr;
+ clRetainProgram_func clRetainProgram = nullptr;
+ clGetProgramBuildInfo_func clGetProgramBuildInfo = nullptr;
+ clEnqueueReadBuffer_func clEnqueueReadBuffer = nullptr;
+ clEnqueueWriteBuffer_func clEnqueueWriteBuffer = nullptr;
+ clReleaseEvent_func clReleaseEvent = nullptr;
+ clReleaseContext_func clReleaseContext = nullptr;
+ clRetainCommandQueue_func clRetainCommandQueue = nullptr;
+ clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr;
+ clReleaseMemObject_func clReleaseMemObject = nullptr;
+ clGetDeviceInfo_func clGetDeviceInfo = nullptr;
+ clGetDeviceIDs_func clGetDeviceIDs = nullptr;
+};
+
+bool arm_compute::opencl_is_available()
+{
+ return CLSymbols::get().clBuildProgram != nullptr;
+}
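Because every stub below degrades to CL_OUT_OF_RESOURCES (or a null object) when libOpenCL.so cannot be loaded, callers are expected to gate CL work on this predicate; a minimal sketch:

    if(!arm_compute::opencl_is_available())
    {
        std::cerr << "No OpenCL runtime found; skipping the CL path." << std::endl;
        return;
    }
    // Safe to create CL buffers, programs and kernels from here on.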
+
+cl_int clBuildProgram(
+ cl_program program,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const char *options,
+ void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+ void *user_data)
+{
+ auto func = CLSymbols::get().clBuildProgram;
+ if(func != nullptr)
+ {
+ return func(program, num_devices, device_list, options, pfn_notify, user_data);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueNDRangeKernel(
+ cl_command_queue command_queue,
+ cl_kernel kernel,
+ cl_uint work_dim,
+ const size_t *global_work_offset,
+ const size_t *global_work_size,
+ const size_t *local_work_size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueNDRangeKernel;
+ if(func != nullptr)
+ {
+ return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clSetKernelArg(
+ cl_kernel kernel,
+ cl_uint arg_index,
+ size_t arg_size,
+ const void *arg_value)
+{
+ auto func = CLSymbols::get().clSetKernelArg;
+ if(func != nullptr)
+ {
+ return func(kernel, arg_index, arg_size, arg_value);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clReleaseMemObject(cl_mem memobj)
+{
+ auto func = CLSymbols::get().clReleaseMemObject;
+ if(func != nullptr)
+ {
+ return func(memobj);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueUnmapMemObject(
+ cl_command_queue command_queue,
+ cl_mem memobj,
+ void *mapped_ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueUnmapMemObject;
+ if(func != nullptr)
+ {
+ return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clRetainCommandQueue(cl_command_queue command_queue)
+{
+ auto func = CLSymbols::get().clRetainCommandQueue;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clReleaseContext(cl_context context)
+{
+ auto func = CLSymbols::get().clReleaseContext;
+ if(func != nullptr)
+ {
+ return func(context);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clReleaseEvent(cl_event event)
+{
+ auto func = CLSymbols::get().clReleaseEvent;
+ if(func != nullptr)
+ {
+ return func(event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueWriteBuffer(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_write,
+ size_t offset,
+ size_t size,
+ const void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueWriteBuffer;
+ if(func != nullptr)
+ {
+ return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clEnqueueReadBuffer(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_read,
+ size_t offset,
+ size_t size,
+ void *ptr,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event)
+{
+ auto func = CLSymbols::get().clEnqueueReadBuffer;
+ if(func != nullptr)
+ {
+ return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetProgramBuildInfo(
+ cl_program program,
+ cl_device_id device,
+ cl_program_build_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ auto func = CLSymbols::get().clGetProgramBuildInfo;
+ if(func != nullptr)
+ {
+ return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clRetainProgram(cl_program program)
+{
+ auto func = CLSymbols::get().clRetainProgram;
+ if(func != nullptr)
+ {
+ return func(program);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+void *clEnqueueMapBuffer(
+ cl_command_queue command_queue,
+ cl_mem buffer,
+ cl_bool blocking_map,
+ cl_map_flags map_flags,
+ size_t offset,
+ size_t size,
+ cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list,
+ cl_event *event,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clEnqueueMapBuffer;
+ if(func != nullptr)
+ {
+ return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clReleaseCommandQueue(cl_command_queue command_queue)
+{
+ auto func = CLSymbols::get().clReleaseCommandQueue;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_program clCreateProgramWithBinary(
+ cl_context context,
+ cl_uint num_devices,
+ const cl_device_id *device_list,
+ const size_t *lengths,
+ const unsigned char **binaries,
+ cl_int *binary_status,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateProgramWithBinary;
+ if(func != nullptr)
+ {
+ return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clRetainContext(cl_context context)
+{
+ auto func = CLSymbols::get().clRetainContext;
+ if(func != nullptr)
+ {
+ return func(context);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clReleaseProgram(cl_program program)
+{
+ auto func = CLSymbols::get().clReleaseProgram;
+ if(func != nullptr)
+ {
+ return func(program);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clFlush(cl_command_queue command_queue)
+{
+ auto func = CLSymbols::get().clFlush;
+ if(func != nullptr)
+ {
+ return func(command_queue);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetProgramInfo(
+ cl_program program,
+ cl_program_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ auto func = CLSymbols::get().clGetProgramInfo;
+ if(func != nullptr)
+ {
+ return func(program, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_kernel clCreateKernel(
+ cl_program program,
+ const char *kernel_name,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateKernel;
+ if(func != nullptr)
+ {
+ return func(program, kernel_name, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clRetainKernel(cl_kernel kernel)
+{
+ auto func = CLSymbols::get().clRetainKernel;
+ if(func != nullptr)
+ {
+ return func(kernel);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_mem clCreateBuffer(
+ cl_context context,
+ cl_mem_flags flags,
+ size_t size,
+ void *host_ptr,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateBuffer;
+ if(func != nullptr)
+ {
+ return func(context, flags, size, host_ptr, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_program clCreateProgramWithSource(
+ cl_context context,
+ cl_uint count,
+ const char **strings,
+ const size_t *lengths,
+ cl_int *errcode_ret)
+{
+ auto func = CLSymbols::get().clCreateProgramWithSource;
+ if(func != nullptr)
+ {
+ return func(context, count, strings, lengths, errcode_ret);
+ }
+ else
+ {
+ if(errcode_ret != nullptr)
+ {
+ *errcode_ret = CL_OUT_OF_RESOURCES;
+ }
+ return nullptr;
+ }
+}
+
+cl_int clReleaseKernel(cl_kernel kernel)
+{
+ auto func = CLSymbols::get().clReleaseKernel;
+ if(func != nullptr)
+ {
+ return func(kernel);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetDeviceIDs(cl_platform_id platform,
+ cl_device_type device_type,
+ cl_uint num_entries,
+ cl_device_id *devices,
+ cl_uint *num_devices)
+{
+ auto func = CLSymbols::get().clGetDeviceIDs;
+ if(func != nullptr)
+ {
+ return func(platform, device_type, num_entries, devices, num_devices);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int clGetDeviceInfo(cl_device_id device,
+ cl_device_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ auto func = CLSymbols::get().clGetDeviceInfo;
+ if(func != nullptr)
+ {
+ return func(device, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl
new file mode 100644
index 0000000000..1761342eb4
--- /dev/null
+++ b/src/core/CL/cl_kernels/absdiff.cl
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate the absolute difference of two input images.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:\n
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in1_ptr Pointer to the first source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] in1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] in1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] in2_ptr Pointer to the second source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] in2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] in2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void absdiff(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ vstore16(CONVERT_SAT(abs_diff(in_a, in_b), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
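Worked example: with U8 inputs and a U8 output, pixels 30 and 80 produce 50. With U8 inputs and an S16 output (the combination shown in the @attention note), both inputs are widened to S16 before abs_diff, so the result always fits and the final CONVERT_SAT never has to clamp.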
diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl
new file mode 100644
index 0000000000..39c1512c3c
--- /dev/null
+++ b/src/core/CL/cl_kernels/accumulate.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function accumulates an input image into an output image.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void accumulate(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu))
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data
+ uchar16 in_data = vload16(0, input.ptr);
+ short16 accu_data = vload16(0, (__global short *)accu.ptr);
+
+ // Perform accumulation
+ short16 res = add_sat(convert_short16(in_data), accu_data);
+
+ // Store result
+ vstore16(res, 0, (__global short *)accu.ptr);
+}
+
+/** This function accumulates a weighted value from an input image to an output image.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] alpha The float scalar weight, with a value in the range of 0 to 1
+ */
+__kernel void accumulate_weighted(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu),
+ const float alpha)
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data
+ const float16 in_data = convert_float16(vload16(0, input.ptr));
+ const float16 accu_data = convert_float16(vload16(0, accu.ptr));
+
+ // Calculate weighted accumulation
+ const uchar16 res = convert_uchar16((1.0f - alpha) * accu_data + alpha * in_data);
+
+ // Store result
+ vstore16(res, 0, accu.ptr);
+}
+
+/** This function accumulates a squared value from an input image to an output image.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] accu_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] accu_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] accu_step_x accu_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accu_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] accu_step_y accu_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accu_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The U32 scalar shift amount, with a value in the range of 0 to 15
+ */
+__kernel void accumulate_squared(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(accu),
+ const uint shift)
+{
+ // Get pixels pointer
+ Image input = CONVERT_TO_IMAGE_STRUCT(input);
+ Image accu = CONVERT_TO_IMAGE_STRUCT(accu);
+
+ // Load data
+ ushort16 in_data = convert_ushort16(vload16(0, input.ptr));
+ uint16 accu_data = convert_uint16(vload16(0, (__global short *)accu.ptr));
+
+ // Calculate squared accumulation
+ short16 res = convert_short16_sat(accu_data + convert_uint16((in_data * in_data) >> shift));
+
+ // Store result
+ vstore16(res, 0, (__global short *)accu.ptr);
+}
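Worked example for accumulate_squared: with shift = 4 and an input pixel of 200, the contribution is (200 * 200) >> 4 = 2500, added to the running total and stored through convert_short16_sat, which clamps the S16 accumulator at 32767.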
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
new file mode 100644
index 0000000000..e3cbb6c801
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. -DTANH
+ * @note Distinction between floating point and integer is done using the -DTYPE_FP and -DTYPE_INT preprocessor arguments
+ * @note A, B variables required by some activation functions are set using -DA= and -DB= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)input.ptr);
+
+ // Perform activation
+#if defined LOGISTIC
+ data = 1 / (1 + exp(-data));
+#elif defined TANH
+ data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data);
+#elif defined RELU
+ data = max(0, data);
+#elif defined BRELU
+ data = min((VEC_DATA_TYPE(DATA_TYPE, 16))A, max(0, data));
+#elif defined SRELU
+ data = log(1 + exp(data));
+#elif defined ABS
+#if defined TYPE_INT
+ data = abs(data);
+#else
+ data = fabs(data);
+#endif
+#elif defined SQUARE
+ data = data * data;
+#elif defined SQRT
+ data = sqrt(data);
+#elif defined LINEAR
+ data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * data + (VEC_DATA_TYPE(DATA_TYPE, 16))B;
+#endif
+
+ // Store result
+ vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
+}
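For instance, building with -DBRELU -DA=6 computes min(6, max(0, x)), i.e. the common ReLU6, while -DLINEAR -DA=1 -DB=0 degenerates to a copy; LOGISTIC and TANH follow the usual 1/(1+exp(-x)) and A*tanh(B*x) forms visible above.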
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
new file mode 100644
index 0000000000..434300efa8
--- /dev/null
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif
+
+/** This function adds two images.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void arithmetic_add(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate and store result
+ vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
+
+/** This function subtracts one image from another.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
+ * @attention To perform a saturating operation, -DSATURATE has to be passed to the compiler, otherwise the wrapping policy will be used.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void arithmetic_sub(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+
+ // Calculate and store result
+ vstore16(SUB(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
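The SATURATE define changes overflow behaviour rather than the arithmetic itself. A worked example in C for -DDATA_TYPE_OUT=uchar (a sketch of the semantics, not library code): 200 + 100 wraps to 44 without the define and clamps to 255 with it.

    #include <stdint.h>

    /* Wrapping policy (no -DSATURATE): modular arithmetic, 200 + 100 -> 44 */
    uint8_t add_wrap(uint8_t x, uint8_t y)
    {
        return (uint8_t)(x + y);
    }

    /* Saturating policy (-DSATURATE maps ADD to add_sat): 200 + 100 -> 255 */
    uint8_t add_saturate(uint8_t x, uint8_t y)
    {
        unsigned sum = (unsigned)x + (unsigned)y;
        return sum > 255u ? 255u : (uint8_t)sum;
    }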
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
new file mode 100644
index 0000000000..13e6702334
--- /dev/null
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: F32
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: F32
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: F32
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: F32
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+ VECTOR_DECLARATION(beta),
+ VECTOR_DECLARATION(gamma),
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+
+ float4 _in = 0;
+ float4 denominator = 0;
+ float4 numerator = 0;
+ float4 x_bar = 0;
+ float4 gamma_vec = 0;
+ float4 beta_vec = 0;
+
+ const int current_slice = get_global_id(2);
+
+ _in = vload4(0, (__global float *)in.ptr);
+ denominator = *((__global float *)(var.ptr + current_slice * var.stride_x));
+ denominator = rsqrt(denominator + epsilon);
+
+ // Calculate x bar and store results
+ numerator = *((__global float *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = _in - numerator;
+ x_bar = numerator * denominator;
+
+ gamma_vec = *((__global float *)(gamma.ptr + current_slice * gamma.stride_x));
+ beta_vec = *((__global float *)(beta.ptr + current_slice * beta.stride_x));
+
+ vstore4(gamma_vec * x_bar + beta_vec, 0, (__global float *)out.ptr);
+}
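Per output element the kernel above evaluates the standard inference-time batch normalization formula, with mean, var, beta and gamma indexed by the Z slice (the feature map). A scalar C restatement of that math (illustrative only):

    #include <math.h>

    /* out = gamma * (x - mean) / sqrt(var + epsilon) + beta, per channel */
    float batchnorm_ref(float x, float mean, float var,
                        float beta, float gamma, float epsilon)
    {
        float x_bar = (x - mean) / sqrtf(var + epsilon); /* rsqrt in the kernel */
        return gamma * x_bar + beta;
    }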
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/bitwise_op.cl
new file mode 100644
index 0000000000..135bfa989c
--- /dev/null
+++ b/src/core/CL/cl_kernels/bitwise_op.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function computes the bitwise OR of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_or(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_a = vload16(0, in1.ptr);
+ uchar16 in_b = vload16(0, in2.ptr);
+
+ vstore16(in_a | in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise AND of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_and(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_a = vload16(0, in1.ptr);
+ uchar16 in_b = vload16(0, in2.ptr);
+
+ vstore16(in_a & in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise XOR of two input images.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_xor(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out))
+{
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_a = vload16(0, in1.ptr);
+ uchar16 in_b = vload16(0, in2.ptr);
+
+ vstore16(in_a ^ in_b, 0, out.ptr);
+}
+
+/** This function computes the bitwise NOT of an image.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_not(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ uchar16 in_data = vload16(0, in.ptr);
+
+ vstore16(~in_data, 0, out.ptr);
+}
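All four kernels consume 16 pixels per work item through vload16/vstore16, so the launch grid along X shrinks by a factor of 16. A rough sketch of that sizing arithmetic in C (illustrative; in practice the library's window and access-window machinery computes the real grid and handles borders):

    #include <stddef.h>

    /* One work item per 16-pixel span along X, one row per work item along Y */
    void ndrange_for_16x1(size_t width, size_t height, size_t gws[2])
    {
        gws[0] = (width + 15) / 16; /* ceil(width / 16) */
        gws[1] = height;
    }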
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
new file mode 100644
index 0000000000..ec6719213c
--- /dev/null
+++ b/src/core/CL/cl_kernels/canny.cl
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculate the magnitude and phase from the horizontal and vertical results of the Sobel operator.
+ *
+ * @note The calculation of the gradient uses level 1 normalisation.
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
+ * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L1(
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(src2),
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle))
+{
+ // Construct images
+ Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
+ Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+
+ // Load sobel horizontal and vertical values
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ h = vload4(0, (__global DATA_TYPE_IN *)src1.ptr);
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ v = vload4(0, (__global DATA_TYPE_IN *)src2.ptr);
+
+ /* Calculate the gradient, using level 1 normalisation method */
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 4)
+ m = CONVERT_SAT((abs(h) + abs(v)), VEC_DATA_TYPE(DATA_TYPE_OUT, 4));
+
+ /* Calculate the angle */
+ float4 p = atan2pi(convert_float4(v), convert_float4(h));
+
+ /* Remap angle to range [0, 256) */
+ p = select(p, p + 2, p < 0.0f) * 128.0f;
+
+ /* Store results */
+ vstore4(m, 0, (__global DATA_TYPE_OUT *)grad.ptr);
+ vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
+}
+
+/** Calculate the gradient and angle from the horizontal and vertical results of the Sobel operator.
+ *
+ * @note The calculation of the gradient uses level 2 normalisation.
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] src1_ptr Pointer to the source image (Horizontal result of Sobel). Supported data types: S16, S32
+ * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src2_ptr Pointer to the source image (Vertical result of Sobel). Supported data types: S16, S32
+ * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] grad_ptr Pointer to the gradient output. Supported data types: U16, U32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ */
+__kernel void combine_gradients_L2(
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(src2),
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle))
+{
+ // Construct images
+ Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+ Image src2 = CONVERT_TO_IMAGE_STRUCT(src2);
+ Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+
+ // Load sobel horizontal and vertical values
+ float4 h = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src1.ptr));
+ float4 v = convert_float4(vload4(0, (__global DATA_TYPE_IN *)src2.ptr));
+
+ /* Calculate the gradient, using level 2 normalisation method */
+ float4 m = sqrt(h * h + v * v);
+
+ /* Calculate the angle */
+ float4 p = atan2pi(v, h);
+
+ /* Remap angle to range [0, 256) */
+ p = select(p, p + 2, p < 0.0f) * 128.0f;
+
+ /* Store results */
+ vstore4(CONVERT_SAT_ROUND(m, VEC_DATA_TYPE(DATA_TYPE_OUT, 4), rte), 0, (__global DATA_TYPE_OUT *)grad.ptr);
+ vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr);
+}
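The two kernels above differ only in the norm: L1 uses |h| + |v| while L2 uses sqrt(h^2 + v^2). Both share the angle remap, where atan2pi's [-1, 1] output is shifted into [0, 2) and scaled by 128 to fit a byte. A scalar C restatement (illustrative only):

    #include <math.h>

    float grad_l1(float h, float v) { return fabsf(h) + fabsf(v); }
    float grad_l2(float h, float v) { return sqrtf(h * h + v * v); }

    /* Remap the angle to [0, 256): e.g. -90 degrees gives p = -0.5,
     * then (-0.5 + 2) * 128 = 192. */
    float angle_u8(float h, float v)
    {
        float p = atan2f(v, h) / (float)M_PI; /* equivalent of atan2pi */
        if(p < 0.0f)
        {
            p += 2.0f;
        }
        return p * 128.0f;
    }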
+
+/** Array that holds the relative coordinate offsets of the neighbouring pixels.
+ */
+__constant short4 neighbours_coords[] =
+{
+ { -1, 0, 1, 0 }, // 0
+ { -1, 1, 1, -1 }, // 45
+ { 0, 1, 0, -1 }, // 90
+ { 1, 1, -1, -1 }, // 135
+ { 1, 0, -1, 0 }, // 180
+ { 1, -1, -1, 1 }, // 225
+ { 0, 1, 0, -1 }, // 270
+ { -1, -1, 1, 1 }, // 315
+ { -1, 0, 1, 0 }, // 360
+};
+
+/** Perform non-maximum suppression.
+ *
+ * @attention The input and output data types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] grad_ptr Pointer to the gradient output. Supported data types: S16, S32
+ * @param[in] grad_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] grad_step_x grad_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] grad_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] grad_step_y grad_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] grad_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] angle_ptr Pointer to the angle output. Supported data types: U8
+ * @param[in] angle_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] angle_step_x angle_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] angle_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] angle_step_y angle_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] angle_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] non_max_ptr Pointer to the non maximum suppressed output. Supported data types: U16, U32
+ * @param[in] non_max_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] non_max_step_x non_max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] non_max_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] non_max_step_y non_max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] non_max_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] lower_thr The low threshold
+ */
+__kernel void suppress_non_maximum(
+ IMAGE_DECLARATION(grad),
+ IMAGE_DECLARATION(angle),
+ IMAGE_DECLARATION(non_max),
+ uint lower_thr)
+{
+ // Construct images
+ Image grad = CONVERT_TO_IMAGE_STRUCT(grad);
+ Image angle = CONVERT_TO_IMAGE_STRUCT(angle);
+ Image non_max = CONVERT_TO_IMAGE_STRUCT(non_max);
+
+ // Get gradient and angle
+ DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr);
+ uchar an = convert_ushort(*angle.ptr);
+
+ if(gradient <= lower_thr)
+ {
+ return;
+ }
+
+ // Divide the full circle into 8 directions
+ uchar ang = 127 - an;
+ DATA_TYPE_OUT q_an = (ang + 16) >> 5;
+
+ // Find the two pixels in the perpendicular direction
+ short2 x_p = neighbours_coords[q_an].s02;
+ short2 y_p = neighbours_coords[q_an].s13;
+ DATA_TYPE_IN g1 = *((global DATA_TYPE_IN *)offset(&grad, x_p.x, y_p.x));
+ DATA_TYPE_IN g2 = *((global DATA_TYPE_IN *)offset(&grad, x_p.y, y_p.y));
+
+ if((gradient > g1) && (gradient > g2))
+ {
+ *((global DATA_TYPE_OUT *)non_max.ptr) = gradient;
+ }
+}
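The index into neighbours_coords is derived from the angle byte by flipping it around 127 and bucketing into 45-degree (32-unit) sectors, with the +16 providing round-to-nearest. A worked C sketch (illustrative only):

    /* an = 64 encodes 90 degrees (256 units span 360 degrees):
     * ang  = 127 - 64       = 63
     * q_an = (63 + 16) >> 5 = 2  -> neighbours_coords[2] = {0, 1, 0, -1},
     * i.e. the pixels directly above and below are compared. */
    int quantize_direction(unsigned char an)
    {
        unsigned char ang = (unsigned char)(127 - an);
        return (ang + 16) >> 5;
    }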
+
+#define EDGE 255
+#define hysteresis_local_stack_L1 8  // The size of the level 1 stack. This has to agree with the host side
+#define hysteresis_local_stack_L2 16 // The size of the level 2 stack; adjusting this can impact the match rate with the VX implementation
+
+/** Check whether a pixel is valid
+*
+* The pixel is skipped if early_test is true.
+* Otherwise, the macro tries to add the pixel coordinate to the level 1 stack, and proceeds to popping the stack instead if the stack is full
+*
+* @param[in] early_test Boolean condition based on the minv check and visited buffer check
+* @param[in] x_pos X-coordinate of pixel that is going to be recorded, has to be within the boundary
+* @param[in] y_pos Y-coordinate of pixel that is going to be recorded, has to be within the boundary
+* @param[in] x_cur X-coordinate of current central pixel
+* @param[in] y_cur Y-coordinate of current central pixel
+*/
+#define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur) \
+ { \
+ if(!early_test) \
+ { \
+ /* Number of elements in the local stack 1, points to next available entry */ \
+ c = *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)); \
+ \
+ if(c > (hysteresis_local_stack_L1 - 1)) /* Stack level 1 is full */ \
+ goto pop_stack; \
+ \
+ /* The pixel that has already been recorded is ignored */ \
+ if(!atomic_or((__global uint *)offset(&recorded, x_pos, y_pos), 1)) \
+ { \
+ l1_ptr[c] = (short2)(x_pos, y_pos); \
+ *((__global char *)offset(&l1_stack_counter, x_cur, y_cur)) += 1; \
+ } \
+ } \
+ }
+
+/** Perform hysteresis.
+ *
+ * @attention The input data type needs to be passed at compile time using -DDATA_TYPE_IN: e.g. -DDATA_TYPE_IN=short
+ *
+ * @param[in] src_ptr Pointer to the input image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] out_ptr Pointer to the output image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] visited_ptr Pointer to the visited buffer, where pixels are marked as visited. Supported data types: U32
+ * @param[in] visited_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] visited_step_x visited_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] visited_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] visited_step_y visited_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] visited_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] recorded_ptr Pointer to the recorded buffer, where pixels are marked as recorded. Supported data types: U32
+ * @param[in] recorded_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] recorded_step_x recorded_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] recorded_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] recorded_step_y recorded_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] recorded_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] l1_stack_ptr Pointer to the l1 stack of a pixel. Supported data types: S32
+ * @param[in] l1_stack_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] l1_stack_step_x l1_stack_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] l1_stack_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] l1_stack_step_y l1_stack_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] l1_stack_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[out] l1_stack_counter_ptr Pointer to the l1 stack counters of an image. Supported data types: U8
+ * @param[in] l1_stack_counter_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] l1_stack_counter_step_x l1_stack_counter_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] l1_stack_counter_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] l1_stack_counter_step_y l1_stack_counter_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] l1_stack_counter_offset_first_element_in_bytes The offset of the first element of the output
+ * @param[in] low_thr The lower threshold
+ * @param[in] up_thr The upper threshold
+ * @param[in] width The width of the image.
+ * @param[in] height The height of the image.
+ */
+kernel void hysteresis(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(visited),
+ IMAGE_DECLARATION(recorded),
+ IMAGE_DECLARATION(l1_stack),
+ IMAGE_DECLARATION(l1_stack_counter),
+ uint low_thr,
+ uint up_thr,
+ int width,
+ int height)
+{
+ // Create images
+ Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
+ Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(out);
+ Image visited = CONVERT_TO_IMAGE_STRUCT_NO_STEP(visited);
+ Image recorded = CONVERT_TO_IMAGE_STRUCT_NO_STEP(recorded);
+ Image l1_stack = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack);
+ Image l1_stack_counter = CONVERT_TO_IMAGE_STRUCT_NO_STEP(l1_stack_counter);
+
+ // Index
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ // Load value
+ DATA_TYPE_IN val = *((__global DATA_TYPE_IN *)offset(&src, x, y));
+
+ // If not above the upper threshold, set to NO_EDGE and return
+ if(val <= up_thr)
+ {
+ *offset(&out, x, y) = 0;
+ return;
+ }
+
+ // Init local stack 2
+ short2 stack_L2[hysteresis_local_stack_L2] = { 0 };
+ int L2_counter = 0;
+
+ // Perform recursive hysteresis
+ while(true)
+ {
+ // Get L1 stack pointer
+ __global short2 *l1_ptr = (__global short2 *)(l1_stack.ptr + y * l1_stack.stride_y + x * hysteresis_local_stack_L1 * l1_stack.stride_x);
+
+ // If the pixel has already been visited, proceed with the items in the stack instead
+ if(atomic_or((__global uint *)offset(&visited, x, y), 1) != 0)
+ {
+ goto pop_stack;
+ }
+
+ // Set strong edge
+ *offset(&out, x, y) = EDGE;
+
+ // If the level 2 stack is full, skip checking the surrounding pixels and pop instead
+ if(L2_counter > (hysteresis_local_stack_L2 - 1))
+ {
+ goto pop_stack2;
+ }
+
+ // Counter for the level 1 stack
+ char c;
+
+ VEC_DATA_TYPE(DATA_TYPE_IN, 4)
+ x_tmp;
+ uint4 v_tmp;
+
+ // Get direction pixel indices
+ int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2);
+
+ // Check the 8 surrounding pixels for weak edges where low_thr < val <= up_thr
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, N));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW
+ check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, N, x, y); // N
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, N, x, y); // NE
+
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, y));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, y));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, y, x, y); // W
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, y, x, y); // E
+
+ x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, S));
+ v_tmp = vload4(0, (__global uint *)offset(&visited, W, S));
+ check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, S, x, y); // SW
+ check_pixel(((x_tmp.s1 <= low_thr) || v_tmp.s1 || (x_tmp.s1 > up_thr)), x, S, x, y); // S
+ check_pixel(((x_tmp.s2 <= low_thr) || v_tmp.s2 || (x_tmp.s2 > up_thr)), E, S, x, y); // SE
+
+#undef check_pixel
+
+pop_stack:
+ c = *((__global char *)offset(&l1_stack_counter, x, y));
+
+ if(c >= 1)
+ {
+ *((__global char *)offset(&l1_stack_counter, x, y)) -= 1;
+ int2 l_c = convert_int2(l1_ptr[c - 1]);
+
+ // Push the current position into level 2 stack
+ stack_L2[L2_counter].x = x;
+ stack_L2[L2_counter].y = y;
+
+ x = l_c.x;
+ y = l_c.y;
+
+ L2_counter++;
+
+ continue;
+ }
+
+ if(L2_counter > 0)
+ {
+ goto pop_stack2;
+ }
+ else
+ {
+ return;
+ }
+
+pop_stack2:
+ L2_counter--;
+ x = stack_L2[L2_counter].x;
+ y = stack_L2[L2_counter].y;
+ };
+}
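The thresholds partition pixels into three classes, which is what drives the control flow above: pixels above up_thr seed the trace, pixels in (low_thr, up_thr] are kept only when reached from a seed, and everything else is discarded by check_pixel's early test. A minimal C restatement of that classification (illustrative only):

    enum edge_class { NO_EDGE, WEAK, STRONG };

    /* STRONG pixels start the recursive trace; WEAK pixels survive only if
     * a trace reaches them; NO_EDGE pixels are never recorded. */
    enum edge_class classify(unsigned val, unsigned low_thr, unsigned up_thr)
    {
        if(val > up_thr)
        {
            return STRONG;
        }
        return val > low_thr ? WEAK : NO_EDGE;
    }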
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
new file mode 100644
index 0000000000..93e80b925e
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function combines three planes to a single RGB image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: RGB
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGB888(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar16 data1 = vload16(0, plane1.ptr);
+ uchar16 data2 = vload16(0, plane2.ptr);
+
+ uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0,
+ data0.s1, data1.s1, data2.s1,
+ data0.s2, data1.s2, data2.s2,
+ data0.s3, data1.s3, data2.s3,
+ data0.s4, data1.s4, data2.s4,
+ data0.s5);
+ vstore16(out0, 0, dst.ptr);
+
+ uchar16 out1 = (uchar16)(data1.s5, data2.s5, data0.s6,
+ data1.s6, data2.s6, data0.s7,
+ data1.s7, data2.s7, data0.s8,
+ data1.s8, data2.s8, data0.s9,
+ data1.s9, data2.s9, data0.sA,
+ data1.sA);
+ vstore16(out1, 0, dst.ptr + 16);
+
+ uchar16 out2 = (uchar16)(data2.sA, data0.sB, data1.sB,
+ data2.sB, data0.sC, data1.sC,
+ data2.sC, data0.sD, data1.sD,
+ data2.sD, data0.sE, data1.sE,
+ data2.sE, data0.sF, data1.sF,
+ data2.sF);
+ vstore16(out2, 0, dst.ptr + 32);
+}
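The three vstore16 calls above implement a plain R/G/B interleave: 16 pixels from each plane become 48 consecutive bytes, split across destination offsets 0, 16 and 32. A scalar C restatement of the layout (illustrative only):

    /* 16 pixels per plane -> 48 interleaved bytes: R0 G0 B0 R1 G1 B1 ... */
    void combine_rgb888_ref(const unsigned char *r, const unsigned char *g,
                            const unsigned char *b, unsigned char *dst)
    {
        for(int i = 0; i < 16; ++i)
        {
            dst[3 * i + 0] = r[i];
            dst[3 * i + 1] = g[i];
            dst[3 * i + 2] = b[i];
        }
    }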
+
+/** This function combines four planes to a single RGBA image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] plane3_ptr Pointer to the fourth plane. Supported Format: U8
+ * @param[in] plane3_stride_x Stride of the fourth plane in X dimension (in bytes)
+ * @param[in] plane3_step_x plane3_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane3_stride_y Stride of the fourth plane in Y dimension (in bytes)
+ * @param[in] plane3_step_y plane3_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane3_offset_first_element_in_bytes The offset of the first element in the fourth plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: RGBA
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_RGBA8888(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(plane3),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image plane3 = CONVERT_TO_IMAGE_STRUCT(plane3);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar16 data1 = vload16(0, plane1.ptr);
+ uchar16 data2 = vload16(0, plane2.ptr);
+ uchar16 data3 = vload16(0, plane3.ptr);
+
+ uchar16 out0 = (uchar16)(data0.s0, data1.s0, data2.s0, data3.s0,
+ data0.s1, data1.s1, data2.s1, data3.s1,
+ data0.s2, data1.s2, data2.s2, data3.s2,
+ data0.s3, data1.s3, data2.s3, data3.s3);
+ vstore16(out0, 0, dst.ptr);
+
+ uchar16 out1 = (uchar16)(data0.s4, data1.s4, data2.s4, data3.s4,
+ data0.s5, data1.s5, data2.s5, data3.s5,
+ data0.s6, data1.s6, data2.s6, data3.s6,
+ data0.s7, data1.s7, data2.s7, data3.s7);
+ vstore16(out1, 0, dst.ptr + 16);
+
+ uchar16 out2 = (uchar16)(data0.s8, data1.s8, data2.s8, data3.s8,
+ data0.s9, data1.s9, data2.s9, data3.s9,
+ data0.sA, data1.sA, data2.sA, data3.sA,
+ data0.sB, data1.sB, data2.sB, data3.sB);
+ vstore16(out2, 0, dst.ptr + 32);
+
+ uchar16 out3 = (uchar16)(data0.sC, data1.sC, data2.sC, data3.sC,
+ data0.sD, data1.sD, data2.sD, data3.sD,
+ data0.sE, data1.sE, data2.sE, data3.sE,
+ data0.sF, data1.sF, data2.sF, data3.sF);
+ vstore16(out3, 0, dst.ptr + 48);
+}
+
+/** This function combines three planes to a single YUYV image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: YUYV
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_YUYV422(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar8 data1 = vload8(0, plane1.ptr);
+ uchar8 data2 = vload8(0, plane2.ptr);
+
+ uchar16 out0 = (uchar16)(data0.s0, data1.s0, data0.s1, data2.s0,
+ data0.s2, data1.s1, data0.s3, data2.s1,
+ data0.s4, data1.s2, data0.s5, data2.s2,
+ data0.s6, data1.s3, data0.s7, data2.s3);
+ vstore16(out0, 0, dst.ptr);
+ uchar16 out1 = (uchar16)(data0.s8, data1.s4, data0.s9, data2.s4,
+ data0.sA, data1.s5, data0.sB, data2.s5,
+ data0.sC, data1.s6, data0.sD, data2.s6,
+ data0.sE, data1.s7, data0.sF, data2.s7);
+ vstore16(out1, 0, dst.ptr + 16);
+}
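YUYV is 4:2:2 packed: each pair of luma samples shares one U and one V, which is why the kernel loads 16 Y values but only 8 U and 8 V. A scalar C restatement of the byte order (illustrative only; the UYVY kernel below emits the same data with chroma first, i.e. U0 Y0 V0 Y1):

    /* Pack one pixel pair as Y0 U0 Y1 V0 */
    void combine_yuyv_ref(const unsigned char *y, const unsigned char *u,
                          const unsigned char *v, unsigned char *dst, int pairs)
    {
        for(int i = 0; i < pairs; ++i)
        {
            dst[4 * i + 0] = y[2 * i + 0];
            dst[4 * i + 1] = u[i];
            dst[4 * i + 2] = y[2 * i + 1];
            dst[4 * i + 3] = v[i];
        }
    }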
+
+/** This function combines three planes to a single UYVY image.
+ *
+ * @param[in] plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] plane0_step_x plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] plane0_step_y plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] plane1_step_x plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] plane1_step_y plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] plane2_step_x plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] plane2_step_y plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_ptr Pointer to the destination image. Supported Format: UYVY
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_combine_UYVY422(
+ IMAGE_DECLARATION(plane0),
+ IMAGE_DECLARATION(plane1),
+ IMAGE_DECLARATION(plane2),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image plane0 = CONVERT_TO_IMAGE_STRUCT(plane0);
+ Image plane1 = CONVERT_TO_IMAGE_STRUCT(plane1);
+ Image plane2 = CONVERT_TO_IMAGE_STRUCT(plane2);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data0 = vload16(0, plane0.ptr);
+ uchar8 data1 = vload8(0, plane1.ptr);
+ uchar8 data2 = vload8(0, plane2.ptr);
+
+ uchar16 out0 = (uchar16)(data1.s0, data0.s0, data2.s0, data0.s1,
+ data1.s1, data0.s2, data2.s1, data0.s3,
+ data1.s2, data0.s4, data2.s2, data0.s5,
+ data1.s3, data0.s6, data2.s3, data0.s7);
+ vstore16(out0, 0, dst.ptr);
+ uchar16 out1 = (uchar16)(data1.s4, data0.s8, data2.s4, data0.s9,
+ data1.s5, data0.sA, data2.s5, data0.sB,
+ data1.s6, data0.sC, data2.s6, data0.sD,
+ data1.s7, data0.sE, data2.s7, data0.sF);
+ vstore16(out1, 0, dst.ptr + 16);
+}
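+
+/* A sketch of the UYVY422 byte layout produced above, with illustrative
+ * array names (Y, U, V stand for the three input planes): each pixel pair i
+ * occupies four destination bytes,
+ *
+ *   dst[4*i + 0] = U[i];        // chroma shared by the pair
+ *   dst[4*i + 1] = Y[2*i];      // first luma sample
+ *   dst[4*i + 2] = V[i];
+ *   dst[4*i + 3] = Y[2*i + 1];  // second luma sample
+ *
+ * so the 16 luma bytes in data0 plus 8 U and 8 V bytes become the 32
+ * destination bytes written by the two vstore16 calls.
+ */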
+
+/** This function combines three planes into a single NV12/NV21 image.
+ *
+ * @note NV12 or NV21 has to be specified through a preprocessor macro, e.g. -DNV12 performs NV12 channel combine.
+ *
+ * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: UV88
+ * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] height Sub-sampled height
+ */
+__kernel void channel_combine_NV(
+ IMAGE_DECLARATION(src_plane0),
+ IMAGE_DECLARATION(src_plane1),
+ IMAGE_DECLARATION(src_plane2),
+ IMAGE_DECLARATION(dst_plane0),
+ IMAGE_DECLARATION(dst_plane1),
+ uint height)
+{
+ // Get pixels pointer
+ Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+ Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+ Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+ Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+ Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+
+ // Copy plane data
+ vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
+ vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+
+    // Create the interleaved UV plane
+ uchar8 data1 = vload8(0, src_plane1.ptr);
+ uchar8 data2 = vload8(0, src_plane2.ptr);
+
+#if defined NV12
+    vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#elif defined NV21
+    vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
+#endif
+}
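+
+/* shuffle2(a, b, mask) indexes the 16-byte concatenation (a, b), so the
+ * (0, 8, 1, 9, ...) mask above interleaves the two 8-byte chroma vectors.
+ * As a sketch with illustrative values a = (U0..U7) and b = (V0..V7):
+ *
+ *   shuffle2(a, b, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))
+ *     == (U0, V0, U1, V1, U2, V2, U3, V3, U4, V4, U5, V5, U6, V6, U7, V7)
+ *
+ * which is the interleaved UV order of the NV12 second plane; swapping the
+ * operands, as in the NV21 branch, yields VU order instead.
+ */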
+
+/** This function combines three planes into a single YUV444 or IYUV image.
+ *
+ * @note YUV444 or IYUV has to be specified through a preprocessor macro, e.g. -DIYUV performs IYUV channel combine.
+ *
+ * @param[in] src_plane0_ptr Pointer to the first plane. Supported Format: U8
+ * @param[in] src_plane0_stride_x Stride of the first plane in X dimension (in bytes)
+ * @param[in] src_plane0_step_x src_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane0_stride_y Stride of the first plane in Y dimension (in bytes)
+ * @param[in] src_plane0_step_y src_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane0_offset_first_element_in_bytes The offset of the first element in the first plane
+ * @param[in] src_plane1_ptr Pointer to the second plane. Supported Format: U8
+ * @param[in] src_plane1_stride_x Stride of the second plane in X dimension (in bytes)
+ * @param[in] src_plane1_step_x src_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane1_stride_y Stride of the second plane in Y dimension (in bytes)
+ * @param[in] src_plane1_step_y src_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane1_offset_first_element_in_bytes The offset of the first element in the second plane
+ * @param[in] src_plane2_ptr Pointer to the third plane. Supported Format: U8
+ * @param[in] src_plane2_stride_x Stride of the third plane in X dimension (in bytes)
+ * @param[in] src_plane2_step_x src_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_plane2_stride_y Stride of the third plane in Y dimension (in bytes)
+ * @param[in] src_plane2_step_y src_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_plane2_offset_first_element_in_bytes The offset of the first element in the third plane
+ * @param[in] dst_plane0_ptr Pointer to the first plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane0_stride_x Stride of the first plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane0_step_x dst_plane0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane0_stride_y Stride of the first plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane0_step_y dst_plane0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane0_offset_first_element_in_bytes The offset of the first element in the first plane of the destination image
+ * @param[in] dst_plane1_ptr Pointer to the second plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane1_stride_x Stride of the second plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane1_step_x dst_plane1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane1_stride_y Stride of the second plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane1_step_y dst_plane1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane1_offset_first_element_in_bytes The offset of the first element in the second plane of the destination image
+ * @param[in] dst_plane2_ptr Pointer to the third plane of the destination image. Supported Format: U8
+ * @param[in] dst_plane2_stride_x Stride of the third plane of the destination image in X dimension (in bytes)
+ * @param[in] dst_plane2_step_x dst_plane2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_plane2_stride_y Stride of the third plane of the destination image in Y dimension (in bytes)
+ * @param[in] dst_plane2_step_y dst_plane2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_plane2_offset_first_element_in_bytes The offset of the first element in the third plane of the destination image
+ * @param[in] height Sub-sampled height
+ */
+__kernel void copy_planes_3p(
+ IMAGE_DECLARATION(src_plane0),
+ IMAGE_DECLARATION(src_plane1),
+ IMAGE_DECLARATION(src_plane2),
+ IMAGE_DECLARATION(dst_plane0),
+ IMAGE_DECLARATION(dst_plane1),
+ IMAGE_DECLARATION(dst_plane2),
+ uint height)
+{
+ // Get pixels pointer
+ Image src_plane0 = CONVERT_TO_IMAGE_STRUCT(src_plane0);
+ Image src_plane1 = CONVERT_TO_IMAGE_STRUCT(src_plane1);
+ Image src_plane2 = CONVERT_TO_IMAGE_STRUCT(src_plane2);
+ Image dst_plane0 = CONVERT_TO_IMAGE_STRUCT(dst_plane0);
+ Image dst_plane1 = CONVERT_TO_IMAGE_STRUCT(dst_plane1);
+ Image dst_plane2 = CONVERT_TO_IMAGE_STRUCT(dst_plane2);
+
+ // Copy plane data
+ vstore16(vload16(0, src_plane0.ptr), 0, dst_plane0.ptr);
+#if defined YUV444
+ vstore16(vload16(0, src_plane1.ptr), 0, dst_plane1.ptr);
+ vstore16(vload16(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#elif defined IYUV
+ vstore16(vload16(0, offset(&src_plane0, 0, height)), 0, (__global uchar *)offset(&dst_plane0, 0, height));
+ vstore8(vload8(0, src_plane1.ptr), 0, dst_plane1.ptr);
+ vstore8(vload8(0, src_plane2.ptr), 0, dst_plane2.ptr);
+#endif
+}
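+
+/* The two branches follow from the plane geometry: in YUV444 all three planes
+ * have full resolution, so 16 bytes per plane are copied per work item; in
+ * IYUV the chroma planes are sub-sampled by two in both dimensions, so each
+ * work item copies a second 16-byte luma row located `height` rows below but
+ * only 8 chroma bytes per plane. E.g. for an illustrative 640x480 IYUV image
+ * the U and V planes are 320x240 and `height` is 240.
+ */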
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
new file mode 100644
index 0000000000..14c6c8a92a
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_extract.cl
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function extracts a given channel from an RGB image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: RGB
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGB888(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+ uchar8 data2 = vload8(0, src.ptr + 16);
+
+#if defined CHANNEL_R
+ vstore4(data.s0369, 0, dst.ptr);
+ vstore4((uchar4)(data.sCF, data2.s25), 0, dst.ptr + 4);
+#elif defined CHANNEL_G
+ vstore4(data.s147A, 0, dst.ptr);
+ vstore4((uchar4)(data.sD, data2.s036), 0, dst.ptr + 4);
+#elif defined CHANNEL_B
+ vstore4(data.s258B, 0, dst.ptr);
+ vstore4((uchar4)(data.sE, data2.s147), 0, dst.ptr + 4);
+#endif
+}
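+
+/* In RGB888 the R byte of pixel i sits at offset 3*i, so the 8 pixels handled
+ * here have R offsets 0, 3, 6, 9, 12, 15, 18 and 21. The vload16 covers
+ * offsets 0-15 (hence the .s0369 and .sCF swizzles) and the vload8 covers
+ * offsets 16-23 (data2.s2 is offset 18, data2.s5 is offset 21); the G and B
+ * branches use the same indices shifted by one and two bytes respectively.
+ */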
+
+/** This function extracts a given channel from an RGBA image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_B will extract the B channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: RGBA
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_RGBA8888(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+ uchar16 data2 = vload16(0, src.ptr + 16);
+
+#if defined CHANNEL_R
+ vstore8((uchar8)(data.s048C, data2.s048C), 0, dst.ptr);
+#elif defined CHANNEL_G
+ vstore8((uchar8)(data.s159D, data2.s159D), 0, dst.ptr);
+#elif defined CHANNEL_B
+ vstore8((uchar8)(data.s26AE, data2.s26AE), 0, dst.ptr);
+#elif defined CHANNEL_A
+ vstore8((uchar8)(data.s37BF, data2.s37BF), 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from a YUYV image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: YUYV
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_YUYV422(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_Y
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined CHANNEL_U
+ vstore4(data.s159D, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore4(data.s37BF, 0, dst.ptr);
+#endif
+}
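+
+/* YUYV422 packs each pixel pair as (Y0, U, Y1, V), so within the 16 loaded
+ * bytes the luma samples sit at even offsets (.s02468ACE), the shared U
+ * samples at offsets 1, 5, 9, 13 (.s159D) and the shared V samples at
+ * offsets 3, 7, 11, 15 (.s37BF). The UYVY422 kernel below applies the same
+ * pattern shifted by one byte, since UYVY packs pairs as (U, Y0, V, Y1).
+ */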
+
+/** This function extracts a given channel from a UYVY image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: UYVY
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_UYVY422(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_Y
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined CHANNEL_U
+ vstore4(data.s048C, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore4(data.s26AE, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from an NV12 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only the U and V channels can be extracted using this kernel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: NV12 (UV88)
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV12(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_U
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#endif
+}
+
+/** This function extracts a given channel from an NV21 image.
+ *
+ * @note Channel to be extracted should be passed as a pre-processor argument, e.g. -DCHANNEL_U will extract the U channel.
+ * @warning Only the U and V channels can be extracted using this kernel.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: NV21 (UV88)
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void channel_extract_NV21(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 data = vload16(0, src.ptr);
+
+#if defined CHANNEL_U
+ vstore8(data.s13579BDF, 0, dst.ptr);
+#elif defined CHANNEL_V
+ vstore8(data.s02468ACE, 0, dst.ptr);
+#endif
+}
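+
+/* Both semi-planar kernels read the interleaved chroma plane: NV12 stores
+ * (U, V) pairs, so U sits at even offsets, while NV21 stores (V, U) pairs,
+ * which is why the .s02468ACE and .s13579BDF swizzles are simply swapped
+ * between the two kernels. E.g. illustrative plane bytes (u0, v0, u1, v1)
+ * of an NV12 image yield (u0, u1) under -DCHANNEL_U and (v0, v1) under
+ * -DCHANNEL_V.
+ */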
+
+/** This function extracts a given plane from a multi-planar image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void copy_plane(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Copy plane data
+ vstore16(vload16(0, src.ptr), 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
new file mode 100644
index 0000000000..f5ec85ae76
--- /dev/null
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -0,0 +1,1823 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Convert an RGB888 image to RGBX8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // Handle 16 pixels per work item
+ uchar16 rgb_0 = vload16(0, in.ptr);
+ uchar16 rgb_1 = vload16(0, in.ptr + 16);
+ uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
+ uchar16 rgba_0 = (uchar16)(rgb_0.s012, 255, rgb_0.s345, 255, rgb_0.s678, 255, rgb_0.s9ab, 255);
+    uchar16 rgba_1 = (uchar16)(rgb_0.scde, 255, rgb_0.sf, rgb_1.s01, 255, rgb_1.s234, 255, rgb_1.s567, 255);
+ uchar16 rgba_2 = (uchar16)(rgb_1.s89a, 255, rgb_1.sbcd, 255, rgb_1.sef, rgb_2.s0, 255, rgb_2.s123, 255);
+ uchar16 rgba_3 = (uchar16)(rgb_2.s456, 255, rgb_2.s789, 255, rgb_2.sabc, 255, rgb_2.sdef, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+ vstore16(rgba_2, 0, out.ptr + 32);
+ vstore16(rgba_3, 0, out.ptr + 48);
+}
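+
+/* Packing arithmetic for the kernel above: 16 pixels are 48 source bytes
+ * (three vload16 calls) and 64 destination bytes (four vstore16 calls), with
+ * each (R, G, B) triplet widened to (R, G, B, 255). E.g. the first output
+ * pixel is (rgb_0.s0, rgb_0.s1, rgb_0.s2, 255).
+ */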
+
+/** Convert an RGBA8888 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGBA8888_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+    // Handle 16 pixels per work item
+ uchar16 rgba_0 = vload16(0, in.ptr);
+ uchar16 rgba_1 = vload16(0, in.ptr + 16);
+ uchar16 rgba_2 = vload16(0, in.ptr + 32);
+ uchar16 rgba_3 = vload16(0, in.ptr + 48);
+
+ uchar16 rgb_0 = (uchar16)(rgba_0.s01245689, rgba_0.sacde, rgba_1.s0124);
+ uchar16 rgb_1 = (uchar16)(rgba_1.s5689acde, rgba_2.s01245689);
+ uchar16 rgb_2 = (uchar16)(rgba_2.sacde, rgba_3.s01245689, rgba_3.sacde);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore16(rgb_1, 0, out.ptr + 16);
+ vstore16(rgb_2, 0, out.ptr + 32);
+}
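+
+/* The inverse packing drops every fourth byte: 16 pixels are 64 source bytes
+ * and 48 destination bytes. A swizzle such as .s01245689 selects the R, G, B
+ * bytes of consecutive RGBA pixels (offsets 0, 1, 2, 4, 5, 6, 8, 9) while
+ * skipping the alpha bytes at offsets 3, 7, 11 and 15.
+ */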
+
+/** Convert a UYVY422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // Handle 8 pixels per work item
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+ char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+ r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+ uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore8(rgb_1, 0, out.ptr + 16);
+}
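+
+/* Sanity check of the BT709 equations above, with illustrative sample values:
+ * a neutral chroma pair U = V = 128 becomes cb = cr = 0 after the -128 bias,
+ * so f_r = f_g = f_b = Y and all eight pixels come out as gray with their
+ * original luma values. The three kernels below use the same equations and
+ * differ only in input byte order and output packing.
+ */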
+
+/** Convert a UYVY422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void UYVY422_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // Handle 8 pixels per work item
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ char8 cb = (char8)(uyvy.s0, uyvy.s0, uyvy.s4, uyvy.s4, uyvy.s8, uyvy.s8, uyvy.sc, uyvy.sc) - (char8)(128);
+ char8 cr = (char8)(uyvy.s2, uyvy.s2, uyvy.s6, uyvy.s6, uyvy.sa, uyvy.sa, uyvy.se, uyvy.se) - (char8)(128);
+
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+ r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+ r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGB888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGB888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // Handle 8 pixels per work item
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
+ char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
+ char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
+
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgb_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2, b_0.s2,
+ r_0.s3, g_0.s3, b_0.s3, r_0.s4, g_0.s4, b_0.s4, r_0.s5);
+ uchar8 rgb_1 = (uchar8)(g_0.s5, b_0.s5, r_0.s6, g_0.s6, b_0.s6, r_0.s7, g_0.s7, b_0.s7);
+
+ vstore16(rgb_0, 0, out.ptr);
+ vstore8(rgb_1, 0, out.ptr + 16);
+}
+
+/** Convert a YUYV422 image to RGBX8888 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void YUYV422_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // Handle 8 pixels per work item
+ uchar16 uyvy = vload16(0, in.ptr);
+
+ uchar8 luma = (uchar8)(uyvy.s0, uyvy.s2, uyvy.s4, uyvy.s6, uyvy.s8, uyvy.sa, uyvy.sc, uyvy.se);
+ char8 cb = (char8)(uyvy.s1, uyvy.s1, uyvy.s5, uyvy.s5, uyvy.s9, uyvy.s9, uyvy.sd, uyvy.sd) - (char8)(128);
+ char8 cr = (char8)(uyvy.s3, uyvy.s3, uyvy.s7, uyvy.s7, uyvy.sb, uyvy.sb, uyvy.sf, uyvy.sf) - (char8)(128);
+
+ float8 f_r = convert_float8(luma) + (float8)(0.0000f) * convert_float8(cb) + (float8)(1.5748f) * convert_float8(cr);
+ float8 f_g = convert_float8(luma) - (float8)(0.1873f) * convert_float8(cb) - (float8)(0.4681f) * convert_float8(cr);
+ float8 f_b = convert_float8(luma) + (float8)(1.8556f) * convert_float8(cb) + (float8)(0.0000f) * convert_float8(cr);
+
+ uchar8 r_0 = convert_uchar8_rtz(f_r);
+ uchar8 g_0 = convert_uchar8_rtz(f_g);
+ uchar8 b_0 = convert_uchar8_rtz(f_b);
+
+ uchar16 rgba_0 = (uchar16)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255,
+ r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ uchar16 rgba_1 = (uchar16)(r_0.s4, g_0.s4, b_0.s4, 255, r_0.s5, g_0.s5, b_0.s5, 255,
+ r_0.s6, g_0.s6, b_0.s6, 255, r_0.s7, g_0.s7, b_0.s7, 255);
+
+ vstore16(rgba_0, 0, out.ptr);
+ vstore16(rgba_1, 0, out.ptr + 16);
+}
+
+/** Convert an RGB image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGB888_to_NV12_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(luma),
+ IMAGE_DECLARATION(uv))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+    // Handle a 2x2 pixel block per work item: two rows, two pixels each
+    // Read 2 pixels of the first row
+ uchar8 rgb_0 = vload8(0, in.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
+
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+    // Read 2 pixels of the second row
+ uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+
+ vstore2(cbcr, 0, out_uv.ptr);
+}
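+
+/* Worked example for the 2x2 chroma averaging above, using an illustrative
+ * block of four pure-blue pixels (R, G, B) = (0, 0, 255):
+ *
+ *   f_y =  0.0722 * 255 =  18.411  ->  luma 18 for all four pixels
+ *   f_u =  0.5000 * 255 = 127.5    ->  rtz 127, +128 = 255
+ *   f_v = -0.0458 * 255 = -11.679  ->  rtz -11, +128 = 117
+ *
+ * All four chroma samples agree, so the averaged (cb, cr) pair stored to the
+ * UV plane is (255, 117).
+ */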
+
+/*
+ R'= Y' + 0.0000*U + 1.5748*V
+ G'= Y' - 0.1873*U - 0.4681*V
+ B'= Y' + 1.8556*U + 0.0000*V
+*/
+
+/** Convert an NV12 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the source luma channel in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source luma channel
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source uv channel in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source uv channel
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGB888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+    // Handle 8 pixels per work item: two rows of 4 pixels
+ uchar4 luma_0 = vload4(0, in_luma.ptr);
+ uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
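+
+/* The chroma terms temp0..temp2 depend only on cb and cr, which the 2x2 block
+ * shares in NV12, so they are computed once and reused for both luma rows.
+ * With a neutral UV pair (128, 128) all three terms are zero and both output
+ * rows reduce to gray pixels (Y, Y, Y).
+ */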
+
+/** Convert an RGB image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_YUV444_bt709(
+ IMAGE_DECLARATION(rgb_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+    // Handle 4 pixels per work item
+ Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 4 pixels
+ uchar16 rgb_0 = vload16(0, in_rgb.ptr);
+ uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s3, rgb_0.s6, rgb_0.s9);
+ uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s4, rgb_0.s7, rgb_0.sa);
+ uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s5, rgb_0.s8, rgb_0.sb);
+
+ float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
+ float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
+ float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
+
+ short4 i_y = convert_short4_rtz(f_y);
+ short4 i_u = convert_short4_rtz(f_u) + (short4)(128);
+ short4 i_v = convert_short4_rtz(f_v) + (short4)(128);
+
+ uchar4 luma_0 = convert_uchar4(max((short4)(0), min(i_y, (short4)(255))));
+ vstore4(luma_0, 0, out_y.ptr);
+
+ uchar4 cb_0 = convert_uchar4(max((short4)(0), min(i_u, (short4)(255))));
+ uchar4 cr_0 = convert_uchar4(max((short4)(0), min(i_v, (short4)(255))));
+ vstore4(cb_0, 0, out_u.ptr);
+ vstore4(cr_0, 0, out_v.ptr);
+}
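+
+/* Worked example for the forward transform above, using an illustrative pure
+ * red input (R, G, B) = (255, 0, 0):
+ *
+ *   f_y =  0.2126 * 255 =  54.213  ->  luma 54
+ *   f_u = -0.1146 * 255 = -29.223  ->  rtz -29, +128 =  99
+ *   f_v =  0.5000 * 255 = 127.5    ->  rtz 127, +128 = 255
+ *
+ * so the stored (Y, U, V) triplet is (54, 99, 255), the clamp leaving all
+ * three values untouched.
+ */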
+
+/** Convert an RGB image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] rgb_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgb_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgb_input_step_x rgb_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgb_input_step_y rgb_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGB888_to_IYUV_bt709(
+ IMAGE_DECLARATION(rgb_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+    // Handle a 2x2 pixel block per work item: two rows, two pixels each
+ Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+    // Read 2 pixels of the first row
+ uchar8 rgb_0 = vload8(0, in_rgb.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s3);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s4);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s5);
+
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+ // Read 2 pixels of the second line
+ uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgb_input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s3);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s4);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s5);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
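+ // Average the four chroma samples of the 2x2 block for 4:2:0 subsampling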
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+ *out_u.ptr = cbcr.x;
+ *out_v.ptr = cbcr.y;
+}
+
+/** Convert an RGBA image to YUV444 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination image V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_YUV444_bt709(
+ IMAGE_DECLARATION(rgba_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ // Handle 4 pixels per invocation
+ Image in_rgba = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Read 4 pixels
+ uchar16 rgb_0 = vload16(0, in_rgba.ptr);
+ uchar4 r_0 = (uchar4)(rgb_0.s0, rgb_0.s4, rgb_0.s8, rgb_0.sc);
+ uchar4 g_0 = (uchar4)(rgb_0.s1, rgb_0.s5, rgb_0.s9, rgb_0.sd);
+ uchar4 b_0 = (uchar4)(rgb_0.s2, rgb_0.s6, rgb_0.sa, rgb_0.se);
+
+ float4 f_y = (float4)(0.0000f) + (float4)(0.2126f) * convert_float4(r_0) + (float4)(0.7152f) * convert_float4(g_0) + (float4)(0.0722f) * convert_float4(b_0);
+ float4 f_u = (float4)(0.0000f) - (float4)(0.1146f) * convert_float4(r_0) - (float4)(0.3854f) * convert_float4(g_0) + (float4)(0.5000f) * convert_float4(b_0);
+ float4 f_v = (float4)(0.0000f) + (float4)(0.5000f) * convert_float4(r_0) - (float4)(0.4542f) * convert_float4(g_0) - (float4)(0.0458f) * convert_float4(b_0);
+
+ short4 i_y = convert_short4(f_y);
+ short4 i_u = convert_short4(f_u) + (short4)(128);
+ short4 i_v = convert_short4(f_v) + (short4)(128);
+
+ uchar4 luma_0 = convert_uchar4_sat(max((short4)(0), min(i_y, (short4)(255))));
+ vstore4(luma_0, 0, out_y.ptr);
+
+ uchar4 cb_0 = convert_uchar4_sat(max((short4)(0), min(i_u, (short4)(255))));
+ uchar4 cr_0 = convert_uchar4_sat(max((short4)(0), min(i_v, (short4)(255))));
+ vstore4(cb_0, 0, out_u.ptr);
+ vstore4(cr_0, 0, out_v.ptr);
+}
+
+/** Convert an RGBA image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_output_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void RGBA8888_to_NV12_bt709(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(uv_output))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+ // Read 2 pixels of the first line
+ uchar8 rgb_0 = vload8(0, in.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6);
+
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+ // Read 2 pixels of the second line
+ uchar8 rgb_1 = vload8(0, in.ptr + input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+ vstore2(cbcr, 0, out_uv.ptr);
+}
+
+/** Convert an RGBA image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 2), height ]
+ * No offset.
+ *
+ * @param[in] rgba_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] rgba_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] rgba_input_step_x rgba_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] rgba_input_step_y rgba_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void RGBA8888_to_IYUV_bt709(
+ IMAGE_DECLARATION(rgba_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ // Handle 4 pixels per invocation: two lines, 2 pixels per line
+ Image in_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Read 2 pixels of the first line
+ uchar8 rgb_0 = vload8(0, in_rgb.ptr);
+ uchar2 r_0 = (uchar2)(rgb_0.s0, rgb_0.s4);
+ uchar2 g_0 = (uchar2)(rgb_0.s1, rgb_0.s5);
+ uchar2 b_0 = (uchar2)(rgb_0.s2, rgb_0.s6);
+
+ float2 f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_0) + (float2)(0.7152f) * convert_float2(g_0) + (float2)(0.0722f) * convert_float2(b_0);
+ float2 f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_0) - (float2)(0.3854f) * convert_float2(g_0) + (float2)(0.5000f) * convert_float2(b_0);
+ float2 f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_0) - (float2)(0.4542f) * convert_float2(g_0) - (float2)(0.0458f) * convert_float2(b_0);
+
+ short2 i_y = convert_short2_rtz(f_y);
+ short2 i_u = convert_short2_rtz(f_u) + (short2)(128);
+ short2 i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_0 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_0, 0, out_y.ptr);
+
+ uchar2 cb_0 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_0 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+
+ // Read 2 pixels of the second line
+ uchar8 rgb_1 = vload8(0, in_rgb.ptr + rgba_input_stride_y);
+ uchar2 r_1 = (uchar2)(rgb_1.s0, rgb_1.s4);
+ uchar2 g_1 = (uchar2)(rgb_1.s1, rgb_1.s5);
+ uchar2 b_1 = (uchar2)(rgb_1.s2, rgb_1.s6);
+
+ f_y = (float2)(0.0000f) + (float2)(0.2126f) * convert_float2(r_1) + (float2)(0.7152f) * convert_float2(g_1) + (float2)(0.0722f) * convert_float2(b_1);
+ f_u = (float2)(0.0000f) - (float2)(0.1146f) * convert_float2(r_1) - (float2)(0.3854f) * convert_float2(g_1) + (float2)(0.5000f) * convert_float2(b_1);
+ f_v = (float2)(0.0000f) + (float2)(0.5000f) * convert_float2(r_1) - (float2)(0.4542f) * convert_float2(g_1) - (float2)(0.0458f) * convert_float2(b_1);
+
+ i_y = convert_short2_rtz(f_y);
+ i_u = convert_short2_rtz(f_u) + (short2)(128);
+ i_v = convert_short2_rtz(f_v) + (short2)(128);
+
+ uchar2 luma_1 = convert_uchar2(max((short2)(0), min(i_y, (short2)(255))));
+ vstore2(luma_1, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar2 cb_1 = convert_uchar2(max((short2)(0), min(i_u, (short2)(255))));
+ uchar2 cr_1 = convert_uchar2(max((short2)(0), min(i_v, (short2)(255))));
+ uchar2 cbcr = (uchar2)(((cb_0.s0 + cb_0.s1 + cb_1.s0 + cb_1.s1) / 4),
+ ((cr_0.s0 + cr_0.s1 + cr_1.s0 + cr_1.s1) / 4));
+ *out_u.ptr = cbcr.x;
+ *out_v.ptr = cbcr.y;
+}
+
+/** Convert an NV12 image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV12_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+ uchar4 luma_0 = vload4(0, in_luma.ptr);
+ uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
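+ // NV12 interleaves U first: even bytes are Cb, odd bytes are Cr; each pair is duplicated to cover two pixels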
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cr = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
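+ // BT.709 full-range YUV -> RGB, as computed below:
+ //   R = Y + 1.5748*Cr
+ //   G = Y - 0.1873*Cb - 0.4681*Cr
+ //   B = Y + 1.8556*Cb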
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore8(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an NV12 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_IYUV_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Handle 32 pixels per invocation: two lines, 16 pixels per line
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
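+ // De-interleave the CbCr plane into separate U and V planes (U first for NV12)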
+ uchar8 cb = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
+ uchar8 cr = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore8(cb, 0, out_u.ptr);
+ vstore8(cr, 0, out_v.ptr);
+}
+
+/** Convert an NV12 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV12_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Handle 32 pixels per invocation: two lines, 16 pixels per line
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
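+ // Upsample 4:2:0 -> 4:4:4: duplicate each chroma sample horizontally here, and vertically on store below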
+ uchar16 cb = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
+ cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
+ uchar16 cr = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
+ cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cb, 0, out_u.ptr);
+ vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+ vstore16(cr, 0, out_v.ptr);
+ vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGB888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+ // Handle 8 pixels per invocation: two lines, 4 pixels per line
+ uchar4 luma_0 = vload4(0, in_y.ptr);
+ uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
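+ // NV21 interleaves V first: even bytes are Cr, odd bytes are Cb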
+ char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
+
+/** Convert an NV21 image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void NV21_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(rgba_output))
+{
+ Image in_luma = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+ // Handle 8 pixels per invocation: two lines, 4 pixels per line
+ uchar4 luma_0 = vload4(0, in_luma.ptr);
+ uchar4 luma_1 = vload4(0, in_luma.ptr + luma_input_stride_y);
+ uchar4 cbcr = vload4(0, in_uv.ptr);
+ char4 cr = (char4)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2) - (char4)(128);
+ char4 cb = (char4)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
+ vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
+}
+
+/** Convert an NV21 image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Handle 32 pixels per invocation: two lines, 16 pixels per line
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
+ uchar16 cr = (uchar16)(cbcr.s0, cbcr.s0, cbcr.s2, cbcr.s2, cbcr.s4, cbcr.s4, cbcr.s6, cbcr.s6, cbcr.s8, cbcr.s8,
+ cbcr.sa, cbcr.sa, cbcr.sc, cbcr.sc, cbcr.se, cbcr.se);
+ uchar16 cb = (uchar16)(cbcr.s1, cbcr.s1, cbcr.s3, cbcr.s3, cbcr.s5, cbcr.s5, cbcr.s7, cbcr.s7, cbcr.s9, cbcr.s9,
+ cbcr.sb, cbcr.sb, cbcr.sd, cbcr.sd, cbcr.sf, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cb, 0, out_u.ptr);
+ vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+ vstore16(cr, 0, out_v.ptr);
+ vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
+
+/** Convert an NV21 image to IYUV
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] uv_input_ptr Pointer to the source uv channel. Supported Format: U8
+ * @param[in] uv_input_stride_x Stride of the source image uv channel in X dimension (in bytes)
+ * @param[in] uv_input_step_x uv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uv_input_step_y uv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ */
+__kernel void NV21_to_IYUV_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(uv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_uv = CONVERT_TO_IMAGE_STRUCT(uv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
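+ // Handle 32 pixels per invocation: two lines, 16 pixels per line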
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar16 cbcr = vload16(0, in_uv.ptr);
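+ // De-interleave the interleaved VU plane into separate U and V planes (V first for NV21)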
+ uchar8 cr = (uchar8)(cbcr.s0, cbcr.s2, cbcr.s4, cbcr.s6, cbcr.s8, cbcr.sa, cbcr.sc, cbcr.se);
+ uchar8 cb = (uchar8)(cbcr.s1, cbcr.s3, cbcr.s5, cbcr.s7, cbcr.s9, cbcr.sb, cbcr.sd, cbcr.sf);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore8(cb, 0, out_u.ptr);
+ vstore8(cr, 0, out_v.ptr);
+}
+
+/** Convert a UYVY image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] uyvy_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] uyvy_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] uyvy_input_step_x uyvy_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uyvy_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] uyvy_input_step_y uyvy_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uyvy_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void UYVY422_to_IYUV_bt709(
+ IMAGE_DECLARATION(uyvy_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_uyvy = CONVERT_TO_IMAGE_STRUCT(uyvy_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Handle 16 pixels per invocation: two lines, 8 pixels per line
+ uchar16 uyvy = vload16(0, in_uyvy.ptr);
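+ // UYVY: even bytes are chroma (Cb and Cr alternating), odd bytes are luma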
+ uchar8 luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ ushort4 cb_0 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
+ ushort4 cr_0 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
+ vstore8(luma, 0, out_y.ptr);
+
+ uyvy = vload16(0, in_uyvy.ptr + uyvy_input_stride_y);
+ luma = (uchar8)(uyvy.s1, uyvy.s3, uyvy.s5, uyvy.s7, uyvy.s9, uyvy.sb, uyvy.sd, uyvy.sf);
+ ushort4 cb_1 = (ushort4)(uyvy.s0, uyvy.s4, uyvy.s8, uyvy.sc);
+ ushort4 cr_1 = (ushort4)(uyvy.s2, uyvy.s6, uyvy.sa, uyvy.se);
+ vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
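+ // Average vertically adjacent chroma pairs to produce 4:2:0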
+ uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
+ uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
+ vstore4(cb, 0, out_u.ptr);
+ vstore4(cr, 0, out_v.ptr);
+}
+
+/** Convert a YUYV image to IYUV using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void YUYV422_to_IYUV_bt709(
+ IMAGE_DECLARATION(yuyv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Handle 16 pixels per invocation: two lines, 8 pixels per line
+ uchar16 yuyv = vload16(0, in_yuyv.ptr);
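+ // YUYV: even bytes are luma, odd bytes are chroma (Cb and Cr alternating)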
+ uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ ushort4 cb_0 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
+ ushort4 cr_0 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
+ vstore8(luma, 0, out_y.ptr);
+
+ yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+ luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ ushort4 cb_1 = (ushort4)(yuyv.s1, yuyv.s5, yuyv.s9, yuyv.sd);
+ ushort4 cr_1 = (ushort4)(yuyv.s3, yuyv.s7, yuyv.sb, yuyv.sf);
+ vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar4 cb = convert_uchar4((cb_0 + cb_1) / (ushort4)(2));
+ uchar4 cr = convert_uchar4((cr_0 + cr_1) / (ushort4)(2));
+ vstore4(cb, 0, out_u.ptr);
+ vstore4(cr, 0, out_v.ptr);
+}
+
+/** Convert an IYUV image to RGB888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] rgb_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgb_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgb_output_step_x rgb_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgb_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgb_output_step_y rgb_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgb_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGB888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(rgb_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgb_output);
+
+ // Handle 8 pixels per invocation: two lines, 4 pixels per line
+ uchar4 luma_0 = vload4(0, in_y.ptr);
+ uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
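+ // Load 2 U and 2 V samples and duplicate each horizontally (one chroma pair per 2x2 block)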
+ uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
+ char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ uchar4 rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore4(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, r_0.s1, g_0.s1, b_0.s1, r_0.s2, g_0.s2);
+ rgb_1 = (uchar4)(b_0.s2, r_0.s3, g_0.s3, b_0.s3);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgb_output_stride_y);
+ vstore4(rgb_1, 0, out_rgb.ptr + rgb_output_stride_y + 8);
+}
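+
+#if 0 /* Illustrative scalar reference, not compiled: the per-pixel BT.709
+ full-range maths implemented by the vectorised conversions in this file;
+ cb and cr are assumed to be already centred around zero (value - 128). */
+inline float3 yuv_to_rgb_bt709(float y, float cb, float cr)
+{
+ return (float3)(y + 1.5748f * cr, // R
+ y - 0.1873f * cb - 0.4681f * cr, // G
+ y + 1.8556f * cb); // B
+}
+#endif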
+
+/** Convert an IYUV image to RGBA8888
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image U channel in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] rgba_output_ptr Pointer to the destination image. Supported Format: U8
+ * @param[in] rgba_output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] rgba_output_step_x rgba_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] rgba_output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] rgba_output_step_y rgba_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] rgba_output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void IYUV_to_RGBA8888_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(rgba_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_rgb = CONVERT_TO_IMAGE_STRUCT(rgba_output);
+
+ // Handle 8 pixels per work-item: two rows of 4 pixels each
+ uchar4 luma_0 = vload4(0, in_y.ptr);
+ uchar4 luma_1 = vload4(0, in_y.ptr + luma_input_stride_y);
+ uchar4 cbcr = (uchar4)(vload2(0, in_u.ptr), vload2(0, in_v.ptr));
+ char4 cb = (char4)(cbcr.s0, cbcr.s0, cbcr.s1, cbcr.s1) - (char4)(128);
+ char4 cr = (char4)(cbcr.s2, cbcr.s2, cbcr.s3, cbcr.s3) - (char4)(128);
+
+ float4 temp0 = (float4)(0.0000f) + (float4)(0.0000f) * convert_float4(cb) + (float4)(1.5748f) * convert_float4(cr);
+ float4 temp1 = (float4)(0.0000f) - (float4)(0.1873f) * convert_float4(cb) - (float4)(0.4681f) * convert_float4(cr);
+ float4 temp2 = (float4)(0.0000f) + (float4)(1.8556f) * convert_float4(cb) + (float4)(0.0000f) * convert_float4(cr);
+
+ float4 f_r = convert_float4(luma_0) + temp0;
+ float4 f_g = convert_float4(luma_0) + temp1;
+ float4 f_b = convert_float4(luma_0) + temp2;
+
+ uchar4 r_0 = convert_uchar4_rtz(f_r);
+ uchar4 g_0 = convert_uchar4_rtz(f_g);
+ uchar4 b_0 = convert_uchar4_rtz(f_b);
+
+ uchar8 rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ uchar8 rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr);
+ vstore8(rgb_1, 0, out_rgb.ptr + 8);
+
+ f_r = convert_float4(luma_1) + temp0;
+ f_g = convert_float4(luma_1) + temp1;
+ f_b = convert_float4(luma_1) + temp2;
+
+ r_0 = convert_uchar4_rtz(f_r);
+ g_0 = convert_uchar4_rtz(f_g);
+ b_0 = convert_uchar4_rtz(f_b);
+
+ rgb_0 = (uchar8)(r_0.s0, g_0.s0, b_0.s0, 255, r_0.s1, g_0.s1, b_0.s1, 255);
+ rgb_1 = (uchar8)(r_0.s2, g_0.s2, b_0.s2, 255, r_0.s3, g_0.s3, b_0.s3, 255);
+ vstore8(rgb_0, 0, out_rgb.ptr + rgba_output_stride_y);
+ vstore8(rgb_1, 0, out_rgb.ptr + rgba_output_stride_y + 8);
+}
+
+/** Convert an IYUV image to YUV444
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image U channel in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] u_output_ptr Pointer to the destination U channel. Supported Format: U8
+ * @param[in] u_output_stride_x Stride of the destination U channel in X dimension (in bytes)
+ * @param[in] u_output_step_x u_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_output_stride_y Stride of the destination image U channel in Y dimension (in bytes)
+ * @param[in] u_output_step_y u_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_output_offset_first_element_in_bytes The offset of the first element in the destination U channel
+ * @param[out] v_output_ptr Pointer to the destination V channel. Supported Format: U8
+ * @param[in] v_output_stride_x Stride of the destination V channel in X dimension (in bytes)
+ * @param[in] v_output_step_x v_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_output_stride_y Stride of the destination V channel in Y dimension (in bytes)
+ * @param[in] v_output_step_y v_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_output_offset_first_element_in_bytes The offset of the first element in the destination V channel
+ *
+ */
+__kernel void IYUV_to_YUV444_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(u_output),
+ IMAGE_DECLARATION(v_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_u = CONVERT_TO_IMAGE_STRUCT(u_output);
+ Image out_v = CONVERT_TO_IMAGE_STRUCT(v_output);
+
+ // Handle 32 pixels per work-item: two rows of 16 pixels each
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar8 cb_src = vload8(0, in_u.ptr);
+ uchar8 cr_src = vload8(0, in_v.ptr);
+ uchar16 cb = (uchar16)(cb_src.s0, cb_src.s0, cb_src.s1, cb_src.s1, cb_src.s2, cb_src.s2, cb_src.s3, cb_src.s3,
+ cb_src.s4, cb_src.s4, cb_src.s5, cb_src.s5, cb_src.s6, cb_src.s6, cb_src.s7, cb_src.s7);
+ uchar16 cr = (uchar16)(cr_src.s0, cr_src.s0, cr_src.s1, cr_src.s1, cr_src.s2, cr_src.s2, cr_src.s3, cr_src.s3,
+ cr_src.s4, cr_src.s4, cr_src.s5, cr_src.s5, cr_src.s6, cr_src.s6, cr_src.s7, cr_src.s7);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cb, 0, out_u.ptr);
+ vstore16(cb, 0, out_u.ptr + u_output_stride_y);
+ vstore16(cr, 0, out_v.ptr);
+ vstore16(cr, 0, out_v.ptr + v_output_stride_y);
+}
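+
+#if 0 /* Illustrative sketch (not compiled): OpenCL permits repeated components
+ in a read swizzle, so the 2x chroma upsampling above can also be written
+ as a single swizzle per channel. */
+inline uchar16 upsample2x(uchar8 c)
+{
+ return c.s0011223344556677; // duplicate each of the 8 components once
+}
+#endif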
+
+/** Convert an IYUV image to NV12
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in] luma_input_ptr Pointer to the source luma channel. Supported Format: U8
+ * @param[in] luma_input_stride_x Stride of the luma image in X dimension (in bytes)
+ * @param[in] luma_input_step_x luma_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_input_stride_y Stride of the source luma channel in Y dimension (in bytes)
+ * @param[in] luma_input_step_y luma_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] u_input_ptr Pointer to the source U channel. Supported Format: U8
+ * @param[in] u_input_stride_x Stride of the source image U channel in X dimension (in bytes)
+ * @param[in] u_input_step_x u_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] u_input_stride_y Stride of the source image U channel in Y dimension (in bytes)
+ * @param[in] u_input_step_y u_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] u_input_offset_first_element_in_bytes The offset of the first element in the source U channel
+ * @param[in] v_input_ptr Pointer to the source V channel. Supported Format: U8
+ * @param[in] v_input_stride_x Stride of the source image V channel in X dimension (in bytes)
+ * @param[in] v_input_step_x v_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] v_input_stride_y Stride of the source image V channel in Y dimension (in bytes)
+ * @param[in] v_input_step_y v_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] v_input_offset_first_element_in_bytes The offset of the first element in the source image V channel
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void IYUV_to_NV12_bt709(
+ IMAGE_DECLARATION(luma_input),
+ IMAGE_DECLARATION(u_input),
+ IMAGE_DECLARATION(v_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(uv_output))
+{
+ Image in_y = CONVERT_TO_IMAGE_STRUCT(luma_input);
+ Image in_u = CONVERT_TO_IMAGE_STRUCT(u_input);
+ Image in_v = CONVERT_TO_IMAGE_STRUCT(v_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+ // Handle 32 pixels per work-item: two rows of 16 pixels each
+ uchar16 luma_0 = vload16(0, in_y.ptr);
+ uchar16 luma_1 = vload16(0, in_y.ptr + luma_input_stride_y);
+ uchar8 cb = vload8(0, in_u.ptr);
+ uchar8 cr = vload8(0, in_v.ptr);
+ uchar16 cbcr = (uchar16)(cb.s0, cr.s0, cb.s1, cr.s1, cb.s2, cr.s2, cb.s3, cr.s3, cb.s4, cr.s4, cb.s5, cr.s5, cb.s6,
+ cr.s6, cb.s7, cr.s7);
+
+ vstore16(luma_0, 0, out_y.ptr);
+ vstore16(luma_1, 0, out_y.ptr + luma_output_stride_y);
+ vstore16(cbcr, 0, out_uv.ptr);
+}
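+
+#if 0 /* Illustrative alternative (a sketch, not used above): the Cb/Cr
+ interleave can also be expressed with the shuffle2() built-in, where
+ mask values 0-7 select from x and 8-15 from y. */
+inline uchar16 interleave_cbcr(uchar8 cb, uchar8 cr)
+{
+ const uchar16 mask = (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+ return shuffle2(cb, cr, mask);
+}
+#endif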
+
+/** Convert a YUYV image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ * No offset.
+ *
+ * @param[in] yuyv_input_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] yuyv_input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] yuyv_input_step_x yuyv_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] yuyv_input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] yuyv_input_step_y yuyv_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] yuyv_input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_output_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_output_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_output_step_x luma_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_output_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_output_step_y luma_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_output_offset_first_element_in_bytes The offset of the first element in the destination luma channel
+ * @param[out] uv_output_ptr Pointer to the destination UV channel. Supported Format: U8
+ * @param[in] uv_output_stride_x Stride of the destination UV channel in X dimension (in bytes)
+ * @param[in] uv_output_step_x uv_output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_output_stride_y Stride of the destination image UV channel in Y dimension (in bytes)
+ * @param[in] uv_output_step_y uv_output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_output_offset_first_element_in_bytes The offset of the first element in the destination UV channel
+ *
+ */
+__kernel void YUYV422_to_NV12_bt709(
+ IMAGE_DECLARATION(yuyv_input),
+ IMAGE_DECLARATION(luma_output),
+ IMAGE_DECLARATION(uv_output))
+{
+ Image in_yuyv = CONVERT_TO_IMAGE_STRUCT(yuyv_input);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma_output);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv_output);
+
+ // Handle 16 pixels per work-item: two rows of 8 pixels each
+ uchar16 yuyv = vload16(0, in_yuyv.ptr);
+ ushort8 cbcr_0 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+ uchar8 luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ vstore8(luma, 0, out_y.ptr);
+
+ yuyv = vload16(0, in_yuyv.ptr + yuyv_input_stride_y);
+ ushort8 cbcr_1 = (ushort8)(yuyv.s1, yuyv.s3, yuyv.s5, yuyv.s7, yuyv.s9, yuyv.sb, yuyv.sd, yuyv.sf);
+ luma = (uchar8)(yuyv.s0, yuyv.s2, yuyv.s4, yuyv.s6, yuyv.s8, yuyv.sa, yuyv.sc, yuyv.se);
+ vstore8(luma, 0, out_y.ptr + luma_output_stride_y);
+
+ uchar8 cbcr = convert_uchar8((cbcr_0 + cbcr_1) / (ushort8)(2));
+ vstore8(cbcr, 0, out_uv.ptr);
+}
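+
+#if 0 /* Illustrative sketch: the widening average of the two chroma rows above
+ is equivalent to the integer built-in hadd(), which computes
+ (x + y) >> 1 without intermediate overflow. */
+inline uchar8 average_chroma_rows(uchar8 row_t, uchar8 row_b)
+{
+ return hadd(row_t, row_b);
+}
+#endif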
+
+/** Convert a UYVY image to NV12 using BT709 color space
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 4), height ]
+ * No offset.
+ *
+ * @param[in] input_uyvy_ptr Pointer to the source image. Supported Format: U8
+ * @param[in] input_uyvy_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_uyvy_step_x input_uyvy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_uyvy_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_uyvy_step_y input_uyvy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_uyvy_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] luma_ptr Pointer to the destination luma channel. Supported Format: U8
+ * @param[in] luma_stride_x Stride of the destination luma channel in X dimension (in bytes)
+ * @param[in] luma_step_x luma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] luma_stride_y Stride of the destination image luma channel in Y dimension (in bytes)
+ * @param[in] luma_step_y luma_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] luma_offset_first_element_in_bytes The offset of the first element in the destination image luma channel
+ * @param[out] uv_ptr Pointer to the destination uv channel. Supported Format: U8
+ * @param[in] uv_stride_x Stride of the destination uv channel in X dimension (in bytes)
+ * @param[in] uv_step_x uv_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] uv_stride_y Stride of the destination image uv channel in Y dimension (in bytes)
+ * @param[in] uv_step_y uv_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] uv_offset_first_element_in_bytes The offset of the first element in the destination image uv channel
+ *
+ */
+__kernel void UYVY422_to_NV12_bt709(
+ IMAGE_DECLARATION(input_uyvy),
+ IMAGE_DECLARATION(luma),
+ IMAGE_DECLARATION(uv))
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input_uyvy);
+ Image out_y = CONVERT_TO_IMAGE_STRUCT(luma);
+ Image out_uv = CONVERT_TO_IMAGE_STRUCT(uv);
+
+ // Handle 16 pixels per work-item: two rows of 8 pixels each
+ const uchar16 uyvy_t = vload16(0, in.ptr);
+ vstore8(uyvy_t.s13579bdf, 0, out_y.ptr);
+
+ const uchar16 uyvy_b = vload16(0, in.ptr + input_uyvy_stride_y);
+ vstore8(uyvy_b.s13579bdf, 0, out_y.ptr + luma_stride_y);
+
+ const ushort8 cbcr_t = (ushort8)(uyvy_t.s0, uyvy_t.s2, uyvy_t.s4, uyvy_t.s6, uyvy_t.s8, uyvy_t.sa, uyvy_t.sc, uyvy_t.se);
+ const ushort8 cbcr_b = (ushort8)(uyvy_b.s0, uyvy_b.s2, uyvy_b.s4, uyvy_b.s6, uyvy_b.s8, uyvy_b.sa, uyvy_b.sc, uyvy_b.se);
+ const uchar8 cbcr = convert_uchar8((cbcr_t + cbcr_b) / (ushort8)(2));
+ vstore8(cbcr, 0, out_uv.ptr);
+}
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
new file mode 100644
index 0000000000..00f5189508
--- /dev/null
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] offset The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_depth(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ unsigned int offset)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ float4 source_values = vload4(0, (__global float *)src.ptr);
+
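+ // "offset" shifts this write into the slice of the output reserved for the
+ // current input along the concatenation axis; the host computes it in bytes.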
+ vstore4(source_values, 0, (__global float *)(dst.ptr + offset));
+}
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
new file mode 100644
index 0000000000..3733d0c733
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel.
+ * @param[in] left_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff Weight of the right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution1x3(__global const uchar *left_pixel,
+ const short left_coeff,
+ const short middle_coeff,
+ const short right_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left * (VEC_DATA_TYPE(DATA_TYPE, 8))left_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right * (VEC_DATA_TYPE(DATA_TYPE, 8))right_coeff;
+}
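+
+#if 0 /* Illustrative sketch (hypothetical pointer "in"): with coefficients
+ (-1, 0, +1) the helper above yields a horizontal central-difference
+ gradient for 8 consecutive pixels. */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+example_gradient(__global const uchar *in)
+{
+ return convolution1x3(in, -1, 0, 1);
+}
+#endif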
+
+/** Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) convolution3x3(
+ Image *src,
+ const short mat0, const short mat1, const short mat2,
+ const short mat3, const short mat4, const short mat5,
+ const short mat6, const short mat7, const short mat8, uint scale)
+{
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ // Row 0
+ pixels = convolution1x3(offset(src, -1, -1), mat0, mat1, mat2);
+ // Row 1
+ pixels += convolution1x3(offset(src, -1, 0), mat3, mat4, mat5);
+ // Row 2
+ pixels += convolution1x3(offset(src, -1, 1), mat6, mat7, mat8);
+
+ // Divide by the scale
+ return pixels / (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+}
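+
+#if 0 /* Illustrative sketch (hypothetical Image pointer "src"): a 3x3 Gaussian
+ blur is one instance of the helper above, using the 1-2-1 binomial
+ weights with scale = 16 (the sum of the coefficients). */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+example_gaussian3x3(Image *src)
+{
+ return convolution3x3(src,
+ 1, 2, 1,
+ 2, 4, 2,
+ 1, 2, 1, 16);
+}
+#endif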
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 3x3 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT8, SCALE), DATA_TYPE, and DATA_TYPE_OUT need to be passed at compile time.\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT8=9, -DSCALE=1, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution3x3_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution3x3(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, SCALE);
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
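+
+/* Illustrative host-side sketch (assumed variable names, not part of this
+ file): the static kernel above is specialised when the program is built by
+ passing the coefficients as preprocessor options, e.g. for a 3x3 Gaussian:
+
+ const char *opts = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 "
+ "-DMAT5=2 -DMAT6=1 -DMAT7=2 -DMAT8=1 -DSCALE=16 "
+ "-DDATA_TYPE=short -DDATA_TYPE_OUT=uchar";
+ clBuildProgram(program, 1, &device, opts, NULL, NULL);
+*/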
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
new file mode 100644
index 0000000000..d1335c5558
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x5(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff
+ + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution5x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ return out;
+}
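+
+// Note: this vertical pass accumulates in COMPUTE_TYPE (int by default) because
+// its inputs are the already-widened intermediates produced by the horizontal
+// pass, which could overflow a 16-bit accumulator.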
+
+/** Apply a 5x5 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3, mat4 ]\n
+ * [ mat5, mat6, mat7, mat8, mat9 ]\n
+ * [ mat10, mat11, mat12, mat13, mat14 ]\n
+ * [ mat15, mat16, mat17, mat18, mat19 ]\n
+ * [ mat20, mat21, mat22, mat23, mat24 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution5x5(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ pixels = convolution1x5(offset(src, -2, -2), mat0, mat1, mat2, mat3, mat4);
+ pixels += convolution1x5(offset(src, -2, -1), mat5, mat6, mat7, mat8, mat9);
+ pixels += convolution1x5(offset(src, -2, 0), mat10, mat11, mat12, mat13, mat14);
+ pixels += convolution1x5(offset(src, -2, 1), mat15, mat16, mat17, mat18, mat19);
+ pixels += convolution1x5(offset(src, -2, 2), mat20, mat21, mat22, mat23, mat24);
+
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x5 static convolution matrix to a single channel U8 input image and output a single temporary channel image (supported data types: U16, S16, S32).
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, -DMAT2=3, -DMAT3=4, -DMAT4=5, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x5_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x5(offset(&src, -2, 0), MAT0, MAT1, MAT2, MAT3, MAT4);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 5x1 static convolution matrix to a single channel U8 input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT5, MAT6, MAT7, MAT8, MAT9, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT5=1 -DMAT6=2, -DMAT7=3, -DMAT8=4, -DMAT9=5, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable5x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution5x1(&src, MAT5, MAT6, MAT7, MAT8, MAT9);
+
+ // Divide by the scale
+ pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
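+
+// Note: a separable filter such as the 5x5 Gaussian maps onto two passes of the
+// kernels above: convolution_separable1x5_static with MAT0..MAT4 = 1,4,6,4,1
+// writing a widened temporary image, then convolution_separable5x1_static with
+// MAT5..MAT9 = 1,4,6,4,1 and SCALE=256 (the sum of the 25 coefficients of the
+// equivalent 5x5 matrix).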
+
+/** Apply a static 5x5 convolution matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT24, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT24=24, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution5x5_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution5x5(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, SCALE);
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
new file mode 100644
index 0000000000..74a0055370
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the second left pixel
+ * @param[in] left3_coeff Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the most right pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x7(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short left3_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff,
+ const short right3_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right3 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE, 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 7 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the second up pixel
+ * @param[in] up3_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the second down pixel
+ * @param[in] down3_coeff Weight of the third down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution7x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short up3_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff,
+ const short down3_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+ return out;
+}
+
+/** Apply a 7x7 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0, mat1, mat2, mat3, mat4, mat5, mat6 ]\n
+ * [ mat7, mat8, mat9, mat10, mat11, mat12, mat13 ]\n
+ * [ mat14, mat15, mat16, mat17, mat18, mat19, mat20 ]\n
+ * [ mat21, mat22, mat23, mat24, mat25, mat26, mat27 ]\n
+ * [ mat28, mat29, mat30, mat31, mat32, mat33, mat34 ]\n
+ * [ mat35, mat36, mat37, mat38, mat39, mat40, mat41 ]\n
+ * [ mat42, mat43, mat44, mat45, mat46, mat47, mat48 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ * @return a short8 containing 8 convoluted and scaled values.
+ */
+short8 convolution7x7(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+ const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+ const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+ const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+ const short mat45, const short mat46, const short mat47, const short mat48, uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ pixels = convolution1x7(offset(src, -3, -3), mat0, mat1, mat2, mat3, mat4, mat5, mat6);
+ pixels += convolution1x7(offset(src, -3, -2), mat7, mat8, mat9, mat10, mat11, mat12, mat13);
+ pixels += convolution1x7(offset(src, -3, -1), mat14, mat15, mat16, mat17, mat18, mat19, mat20);
+ pixels += convolution1x7(offset(src, -3, 0), mat21, mat22, mat23, mat24, mat25, mat26, mat27);
+ pixels += convolution1x7(offset(src, -3, 1), mat28, mat29, mat30, mat31, mat32, mat33, mat34);
+ pixels += convolution1x7(offset(src, -3, 2), mat35, mat36, mat37, mat38, mat39, mat40, mat41);
+ pixels += convolution1x7(offset(src, -3, 3), mat42, mat43, mat44, mat45, mat46, mat47, mat48);
+
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ return convert_short8_sat(pixels);
+}
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x7 static convolution matrix to a single channel U8 input image and output a single temporary channel image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=1 -DMAT1=2, ... -DMAT6=6, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x7_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x7(offset(&src, -3, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
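+
+// Illustrative only: a plausible OpenCL program build-options string for the
+// kernel above, using the macros documented in its @attention block. The
+// coefficient values here are made-up examples, not library defaults:
+//   "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=3 -DMAT5=2 -DMAT6=1 -DDATA_TYPE=int"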
+
+/** Apply a 7x1 static convolution matrix to a single channel input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT7=7 -DMAT8=8, ... -DMAT13=13, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable7x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution7x1(&src, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13);
+
+ // Divide by the scale
+ pixels /= (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 7x7 convolution matrix to a single channel U8 input image and output a single channel image including the borders.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT48, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT48=48, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution7x7_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution7x7(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+ MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+ MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, SCALE);
+
+ // Saturate results to the output data type's range and store them in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
new file mode 100644
index 0000000000..d8b07cafac
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+/** Compute a 1D horizontal convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] left_pixel Pointer to the left pixel
+ * @param[in] left1_coeff Weight of the most left pixel
+ * @param[in] left2_coeff Weight of the second left pixel
+ * @param[in] left3_coeff Weight of the third left pixel
+ * @param[in] left4_coeff Weight of the closest left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right1_coeff Weight of the closest right pixel
+ * @param[in] right2_coeff Weight of the second right pixel
+ * @param[in] right3_coeff Weight of the third right pixel
+ * @param[in] right4_coeff Weight of the most right pixel
+ *
+ * @return a vector of 8 convolved values of type DATA_TYPE.
+ */
+VEC_DATA_TYPE(DATA_TYPE, 8)
+convolution1x9(
+ __global const uchar *left_pixel,
+ const short left1_coeff,
+ const short left2_coeff,
+ const short left3_coeff,
+ const short left4_coeff,
+ const short middle_coeff,
+ const short right1_coeff,
+ const short right2_coeff,
+ const short right3_coeff,
+ const short right4_coeff)
+{
+ uchar16 temp = vload16(0, left_pixel);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left1 = CONVERT(temp.s01234567, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left2 = CONVERT(temp.s12345678, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left3 = CONVERT(temp.s23456789, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ left4 = CONVERT(temp.s3456789a, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ middle = CONVERT(temp.s456789ab, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right1 = CONVERT(temp.s56789abc, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right2 = CONVERT(temp.s6789abcd, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right3 = CONVERT(temp.s789abcde, VEC_DATA_TYPE(DATA_TYPE, 8));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ right4 = CONVERT(temp.s89abcdef, VEC_DATA_TYPE(DATA_TYPE, 8));
+
+ return left1 * (VEC_DATA_TYPE(DATA_TYPE, 8))left1_coeff + left2 * (VEC_DATA_TYPE(DATA_TYPE, 8))left2_coeff + left3 * (VEC_DATA_TYPE(DATA_TYPE, 8))left3_coeff + left4 * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))left4_coeff + middle * (VEC_DATA_TYPE(DATA_TYPE, 8))middle_coeff + right1 * (VEC_DATA_TYPE(DATA_TYPE, 8))right1_coeff + right2 * (VEC_DATA_TYPE(DATA_TYPE,
+ 8))right2_coeff + right3 * (VEC_DATA_TYPE(DATA_TYPE, 8))right3_coeff + right4 * (VEC_DATA_TYPE(DATA_TYPE, 8))right4_coeff;
+}
+
+/** Compute a 1D vertical convolution of size 9 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the second up pixel
+ * @param[in] up3_coeff Weight of the third up pixel
+ * @param[in] up4_coeff Weight of the closest up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the closest down pixel
+ * @param[in] down2_coeff Weight of the second down pixel
+ * @param[in] down3_coeff Weight of the third down pixel
+ * @param[in] down4_coeff Weight of the most down pixel
+ *
+ * @return a vector of 8 convolved values of type COMPUTE_TYPE.
+ */
+VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+convolution9x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short up3_coeff,
+ const short up4_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff,
+ const short down3_coeff,
+ const short down4_coeff)
+{
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ val;
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ out = (VEC_DATA_TYPE(COMPUTE_TYPE, 8))0;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, -1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))up4_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 0)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))middle_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 1)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down1_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 2)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down2_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 3)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down3_coeff;
+
+ val = CONVERT(vload8(0, (__global DATA_TYPE *)offset(src, 0, 4)), VEC_DATA_TYPE(COMPUTE_TYPE, 8));
+ out += val * (VEC_DATA_TYPE(COMPUTE_TYPE, 8))down4_coeff;
+
+ return out;
+}
+
+/** Apply a 9x9 convolution matrix to a single channel U8 input image and return the result.
+ *
+ * Convolution matrix layout:\n
+ * [ mat0,  mat1,  mat2,  mat3,  mat4,  mat5,  mat6,  mat7,  mat8  ]\n
+ * [ mat9,  mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17 ]\n
+ * [ mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26 ]\n
+ * [ mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35 ]\n
+ * [ mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44 ]\n
+ * [ mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53 ]\n
+ * [ mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62 ]\n
+ * [ mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71 ]\n
+ * [ mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80 ]
+ *
+ * @param[in] src A pointer to source Image structure.
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ * @param[in] mat9 Coefficient from the convolution matrix
+ * @param[in] mat10 Coefficient from the convolution matrix
+ * @param[in] mat11 Coefficient from the convolution matrix
+ * @param[in] mat12 Coefficient from the convolution matrix
+ * @param[in] mat13 Coefficient from the convolution matrix
+ * @param[in] mat14 Coefficient from the convolution matrix
+ * @param[in] mat15 Coefficient from the convolution matrix
+ * @param[in] mat16 Coefficient from the convolution matrix
+ * @param[in] mat17 Coefficient from the convolution matrix
+ * @param[in] mat18 Coefficient from the convolution matrix
+ * @param[in] mat19 Coefficient from the convolution matrix
+ * @param[in] mat20 Coefficient from the convolution matrix
+ * @param[in] mat21 Coefficient from the convolution matrix
+ * @param[in] mat22 Coefficient from the convolution matrix
+ * @param[in] mat23 Coefficient from the convolution matrix
+ * @param[in] mat24 Coefficient from the convolution matrix
+ * @param[in] mat25 Coefficient from the convolution matrix
+ * @param[in] mat26 Coefficient from the convolution matrix
+ * @param[in] mat27 Coefficient from the convolution matrix
+ * @param[in] mat28 Coefficient from the convolution matrix
+ * @param[in] mat29 Coefficient from the convolution matrix
+ * @param[in] mat30 Coefficient from the convolution matrix
+ * @param[in] mat31 Coefficient from the convolution matrix
+ * @param[in] mat32 Coefficient from the convolution matrix
+ * @param[in] mat33 Coefficient from the convolution matrix
+ * @param[in] mat34 Coefficient from the convolution matrix
+ * @param[in] mat35 Coefficient from the convolution matrix
+ * @param[in] mat36 Coefficient from the convolution matrix
+ * @param[in] mat37 Coefficient from the convolution matrix
+ * @param[in] mat38 Coefficient from the convolution matrix
+ * @param[in] mat39 Coefficient from the convolution matrix
+ * @param[in] mat40 Coefficient from the convolution matrix
+ * @param[in] mat41 Coefficient from the convolution matrix
+ * @param[in] mat42 Coefficient from the convolution matrix
+ * @param[in] mat43 Coefficient from the convolution matrix
+ * @param[in] mat44 Coefficient from the convolution matrix
+ * @param[in] mat45 Coefficient from the convolution matrix
+ * @param[in] mat46 Coefficient from the convolution matrix
+ * @param[in] mat47 Coefficient from the convolution matrix
+ * @param[in] mat48 Coefficient from the convolution matrix
+ * @param[in] mat49 Coefficient from the convolution matrix
+ * @param[in] mat50 Coefficient from the convolution matrix
+ * @param[in] mat51 Coefficient from the convolution matrix
+ * @param[in] mat52 Coefficient from the convolution matrix
+ * @param[in] mat53 Coefficient from the convolution matrix
+ * @param[in] mat54 Coefficient from the convolution matrix
+ * @param[in] mat55 Coefficient from the convolution matrix
+ * @param[in] mat56 Coefficient from the convolution matrix
+ * @param[in] mat57 Coefficient from the convolution matrix
+ * @param[in] mat58 Coefficient from the convolution matrix
+ * @param[in] mat59 Coefficient from the convolution matrix
+ * @param[in] mat60 Coefficient from the convolution matrix
+ * @param[in] mat61 Coefficient from the convolution matrix
+ * @param[in] mat62 Coefficient from the convolution matrix
+ * @param[in] mat63 Coefficient from the convolution matrix
+ * @param[in] mat64 Coefficient from the convolution matrix
+ * @param[in] mat65 Coefficient from the convolution matrix
+ * @param[in] mat66 Coefficient from the convolution matrix
+ * @param[in] mat67 Coefficient from the convolution matrix
+ * @param[in] mat68 Coefficient from the convolution matrix
+ * @param[in] mat69 Coefficient from the convolution matrix
+ * @param[in] mat70 Coefficient from the convolution matrix
+ * @param[in] mat71 Coefficient from the convolution matrix
+ * @param[in] mat72 Coefficient from the convolution matrix
+ * @param[in] mat73 Coefficient from the convolution matrix
+ * @param[in] mat74 Coefficient from the convolution matrix
+ * @param[in] mat75 Coefficient from the convolution matrix
+ * @param[in] mat76 Coefficient from the convolution matrix
+ * @param[in] mat77 Coefficient from the convolution matrix
+ * @param[in] mat78 Coefficient from the convolution matrix
+ * @param[in] mat79 Coefficient from the convolution matrix
+ * @param[in] mat80 Coefficient from the convolution matrix
+ * @param[in] scale Convolution matrix scale (Sum of the coefficients, or 1 if the sum is 0)
+ *
+ */
+short8 convolution9x9(
+ Image *src,
+ const short mat0, const short mat1, const short mat2, const short mat3, const short mat4,
+ const short mat5, const short mat6, const short mat7, const short mat8, const short mat9,
+ const short mat10, const short mat11, const short mat12, const short mat13, const short mat14,
+ const short mat15, const short mat16, const short mat17, const short mat18, const short mat19,
+ const short mat20, const short mat21, const short mat22, const short mat23, const short mat24,
+ const short mat25, const short mat26, const short mat27, const short mat28, const short mat29,
+ const short mat30, const short mat31, const short mat32, const short mat33, const short mat34,
+ const short mat35, const short mat36, const short mat37, const short mat38, const short mat39,
+ const short mat40, const short mat41, const short mat42, const short mat43, const short mat44,
+ const short mat45, const short mat46, const short mat47, const short mat48, const short mat49,
+ const short mat50, const short mat51, const short mat52, const short mat53, const short mat54,
+ const short mat55, const short mat56, const short mat57, const short mat58, const short mat59,
+ const short mat60, const short mat61, const short mat62, const short mat63, const short mat64,
+ const short mat65, const short mat66, const short mat67, const short mat68, const short mat69,
+ const short mat70, const short mat71, const short mat72, const short mat73, const short mat74,
+ const short mat75, const short mat76, const short mat77, const short mat78, const short mat79,
+ const short mat80, uint scale)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels;
+
+ pixels = convolution1x9(offset(src, -4, -4), mat0, mat1, mat2, mat3, mat4, mat5, mat6, mat7, mat8);
+ pixels += convolution1x9(offset(src, -4, -3), mat9, mat10, mat11, mat12, mat13, mat14, mat15, mat16, mat17);
+ pixels += convolution1x9(offset(src, -4, -2), mat18, mat19, mat20, mat21, mat22, mat23, mat24, mat25, mat26);
+ pixels += convolution1x9(offset(src, -4, -1), mat27, mat28, mat29, mat30, mat31, mat32, mat33, mat34, mat35);
+ pixels += convolution1x9(offset(src, -4, 0), mat36, mat37, mat38, mat39, mat40, mat41, mat42, mat43, mat44);
+ pixels += convolution1x9(offset(src, -4, 1), mat45, mat46, mat47, mat48, mat49, mat50, mat51, mat52, mat53);
+ pixels += convolution1x9(offset(src, -4, 2), mat54, mat55, mat56, mat57, mat58, mat59, mat60, mat61, mat62);
+ pixels += convolution1x9(offset(src, -4, 3), mat63, mat64, mat65, mat66, mat67, mat68, mat69, mat70, mat71);
+ pixels += convolution1x9(offset(src, -4, 4), mat72, mat73, mat74, mat75, mat76, mat77, mat78, mat79, mat80);
+
+ if(scale > 0)
+ {
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))scale;
+ }
+
+ return convert_short8_sat(pixels);
+}
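+
+// Worked example (illustrative): for a 9x9 box filter whose 81 coefficients
+// are all 1, the coefficient sum is 81, so passing scale = 81 makes the
+// division above return the plain neighbourhood average for each of the 8
+// processed pixels.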
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a 1x9 static convolution matrix to a single channel U8 input image and output a single channel temporary image.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8) and DATA_TYPE need to be passed at compile time:\n
+ * e.g. -DMAT0=7 -DMAT1=8, ... -DMAT8=8, -DDATA_TYPE=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16, S16, S32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable1x9_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = convolution1x9(offset(&src, -4, 0), MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8);
+
+ // Store result in dst
+ vstore8(pixels, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply a 9x1 static convolution matrix to a single channel input image and output a single channel image.
+ *
+ * @attention The matrix coefficients (MAT9, MAT10, ... MAT17, SCALE), COMPUTE_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT9=9 -DMAT10=10, ... -DMAT17=17, -DSCALE=6, -DCOMPUTE_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16, S16, S32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_separable9x1_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Output pixels
+ VEC_DATA_TYPE(COMPUTE_TYPE, 8)
+ pixels = convolution9x1(&src, MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17);
+
+ // Divide by the scale
+ pixels = pixels / (VEC_DATA_TYPE(COMPUTE_TYPE, 8))SCALE;
+
+ // Store result in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+/** Apply a static 9x9 convolution matrix to a single channel U8 input image and output a single channel image including the borders.
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT80, SCALE) and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution9x9_static(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short8 pixels = convolution9x9(&src,
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, MAT9, MAT10, MAT11, MAT12, MAT13,
+ MAT14, MAT15, MAT16, MAT17, MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25,
+ MAT26, MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35, MAT36, MAT37,
+ MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44, MAT45, MAT46, MAT47, MAT48, MAT49,
+ MAT50, MAT51, MAT52, MAT53, MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61,
+ MAT62, MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71, MAT72, MAT73,
+ MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80, SCALE);
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, (__global DATA_TYPE_OUT *)dst.ptr);
+}
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
new file mode 100644
index 0000000000..bd5dfaff68
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This kernel reshapes the lowest three dimensions of the tensor into a single column.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as input
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr Pointer to the bias tensor. Same as input
+ * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ * @param[in] depth The depth of the input tensor
+ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
+ */
+__kernel void reshape_to_columns(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+#if defined HAS_BIAS
+ VECTOR_DECLARATION(bias),
+#endif
+ uint width, uint height, uint depth, uint total_filters)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
+
+ __global uchar *tmp_src_ptr = src.ptr;
+ __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
+ 2) * width * height * dst_stride_y;
+#if defined HAS_BIAS
+ __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif
+
+ if(is_last_thread)
+ {
+ for(uint i = 0; i < total_filters; ++i)
+ {
+ *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
+
+#if defined HAS_BIAS
+ *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+ tmp_bias_ptr += bias_stride_x;
+#endif
+ tmp_src_ptr += depth * src_stride_z;
+ tmp_dst_ptr += dst_stride_x;
+ }
+ }
+ else
+ {
+ for(uint i = 0; i < total_filters; ++i)
+ {
+ *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
+ tmp_src_ptr += depth * src_stride_z;
+ tmp_dst_ptr += dst_stride_x;
+ }
+ }
+}
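+
+// Layout sketch (illustrative, derived from the addressing above): for 3x3x2
+// weights with 4 filters, dst is an 18 x 4 matrix (19 x 4 with HAS_BIAS).
+// Column f holds filter f's weights ordered x-fastest, then y, then z; with
+// HAS_BIAS the last work-item appends each filter's bias as the final row.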
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note If biases are to be added to the convolution, -DHAS_BIAS has to be passed at compile time so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] kernel_size The convolution kernel size
+ * @param[in] kernel_depth The kernel depth
+ * @param[in] width The output tensor width
+ * @param[in] input_dims The input tensor dimensions
+ * @param[in] strides The strides of the im2col operation
+ * @param[in] paddings The input tensor paddings
+ */
+__kernel void im2col_generic(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ int kernel_size,
+ int kernel_depth,
+ int width,
+ int2 input_dims,
+ int2 strides,
+ int2 paddings)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT_NO_STEP(dst);
+
+ // Determine output index
+ uint idx = (get_global_id(1) * width + get_global_id(0)) * dst.stride_y;
+ __global uchar *output_ptr = dst.ptr + idx;
+
+ // Determine current input index
+ const int top_left_x = get_global_id(0) * strides.x - paddings.x;
+ const int top_left_y = get_global_id(1) * strides.y - paddings.y;
+
+ // Linearize convolution elements
+ for(int d = 0; d < kernel_depth; ++d)
+ {
+ for(int y = top_left_y, y_e = top_left_y + kernel_size; y < y_e; ++y)
+ {
+ for(int x = top_left_x, x_e = top_left_x + kernel_size; x < x_e; ++x, output_ptr += dst.stride_x)
+ {
+ if(x < 0 || x >= input_dims.x || y < 0 || y >= input_dims.y)
+ {
+ *((__global DATA_TYPE *)output_ptr) = 0;
+ }
+ else
+ {
+ *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)(tensor3D_offset(&src, x, y, d)));
+ }
+ }
+ }
+ }
+
+#if defined HAS_BIAS
+ *((__global DATA_TYPE *)output_ptr) = 1;
+#endif
+}
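+
+// Layout sketch (illustrative): each work-item fills one dst row for one
+// output position (x, y) = (gid0, gid1). With kernel_size = 3 and
+// kernel_depth = 2 that row holds 3*3*2 = 18 values (19 with HAS_BIAS, the
+// trailing 1), with out-of-bounds taps written as 0 by the branch above.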
+
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The output tensor width
+ */
+__kernel void col2im(
+ IMAGE_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+
+ int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / width) * dst.stride_y + (get_global_id(1) % width) * dst.stride_x;
+ __global uchar *tmp_out_ptr = dst.ptr + idx;
+ *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)(src.ptr));
+}
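+
+// Mapping sketch (illustrative): each GEMM result element at (gid0, gid1) =
+// (channel, linear output position) lands in dst at x = gid1 % width,
+// y = gid1 / width, z = gid0, matching the idx computation above.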
+
+/** This kernel reshapes the lowest three dimensions of the tensor into a single row for a GEMM operation.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note If biases are added at a later stage, -DHAS_BIAS has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as input.
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ */
+__kernel void im2col_reduced(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst),
+ uint width, uint height)
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const uint image_size = width * height;
+
+ __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;
+
+ *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
+
+#if defined HAS_BIAS
+ // If it is the last thread in the 3 dimensional workgroup
+ if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
+ {
+ tmp_out_ptr += dst_stride_x;
+ *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1;
+ }
+#endif
+}
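+
+// Layout sketch (illustrative): the whole W x H x D input is flattened into
+// one vector of W*H*D elements ordered x-fastest, then y, then z; with
+// HAS_BIAS the last work-item appends a trailing 1.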
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
new file mode 100644
index 0000000000..96b9cff3eb
--- /dev/null
+++ b/src/core/CL/cl_kernels/convolution_rectangle.cl
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "convolution3x3.cl"
+#include "convolution5x5.cl"
+#include "convolution7x7.cl"
+#include "convolution9x9.cl"
+#include "helpers.h"
+
+#define MAT_INDEX(i) MAT##i
+
+#ifndef DATA_TYPE
+#define DATA_TYPE short
+#endif
+
+#ifndef COMPUTE_TYPE
+#define COMPUTE_TYPE int
+#endif
+
+#ifndef DATA_TYPE_OUT
+#define DATA_TYPE_OUT uchar
+#endif
+
+#ifndef DYNAMIC_MATRIX_CONVOLUTION
+
+/** Apply a rectangle matrix to a single channel U8 input image and output a single channel image including borders
+ *
+ * @attention The matrix coefficients (MAT0, MAT1, ... MAT80, SCALE), MATRIX_WIDTH, MATRIX_HEIGHT, COMPUTE_TYPE, DATA_TYPE and DATA_TYPE_OUT need to be passed at compile time:\n
+ * e.g. -DMAT0=0 -DMAT1=1, ... -DMAT80=80, -DSCALE=6, -DMATRIX_WIDTH=3, -DMATRIX_HEIGHT=5, -DCOMPUTE_TYPE=int, -DDATA_TYPE=int, -DDATA_TYPE_OUT=int
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convolution_rectangle(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ short matrix_coeff[81] =
+ {
+ MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8,
+ MAT9, MAT10, MAT11, MAT12, MAT13, MAT14, MAT15, MAT16, MAT17,
+ MAT18, MAT19, MAT20, MAT21, MAT22, MAT23, MAT24, MAT25, MAT26,
+ MAT27, MAT28, MAT29, MAT30, MAT31, MAT32, MAT33, MAT34, MAT35,
+ MAT36, MAT37, MAT38, MAT39, MAT40, MAT41, MAT42, MAT43, MAT44,
+ MAT45, MAT46, MAT47, MAT48, MAT49, MAT50, MAT51, MAT52, MAT53,
+ MAT54, MAT55, MAT56, MAT57, MAT58, MAT59, MAT60, MAT61, MAT62,
+ MAT63, MAT64, MAT65, MAT66, MAT67, MAT68, MAT69, MAT70, MAT71,
+ MAT72, MAT73, MAT74, MAT75, MAT76, MAT77, MAT78, MAT79, MAT80
+ };
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ pixels = (VEC_DATA_TYPE(DATA_TYPE, 8))0;
+
+ for(int i = 0; i < MATRIX_HEIGHT; i++)
+ {
+#if MATRIX_WIDTH == 3
+ pixels += convolution1x3(offset(&src, -1, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 3], matrix_coeff[1 + i * 3],
+ matrix_coeff[2 + i * 3]);
+#endif
+
+#if MATRIX_WIDTH == 5
+ pixels += convolution1x5(offset(&src, -2, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 5], matrix_coeff[1 + i * 5],
+ matrix_coeff[2 + i * 5], matrix_coeff[3 + i * 5], matrix_coeff[4 + i * 5]);
+#endif
+
+#if MATRIX_WIDTH == 7
+ pixels += convolution1x7(offset(&src, -3, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 7], matrix_coeff[1 + i * 7],
+ matrix_coeff[2 + i * 7], matrix_coeff[3 + i * 7], matrix_coeff[4 + i * 7],
+ matrix_coeff[5 + i * 7], matrix_coeff[6 + i * 7]);
+#endif
+
+#if MATRIX_WIDTH == 9
+ pixels += convolution1x9(offset(&src, -4, -(MATRIX_HEIGHT / 2) + i), matrix_coeff[0 + i * 9], matrix_coeff[1 + i * 9],
+ matrix_coeff[2 + i * 9], matrix_coeff[3 + i * 9], matrix_coeff[4 + i * 9],
+ matrix_coeff[5 + i * 9], matrix_coeff[6 + i * 9], matrix_coeff[7 + i * 9], matrix_coeff[8 + i * 9]);
+#endif
+ }
+
+ pixels /= (VEC_DATA_TYPE(DATA_TYPE, 8))SCALE;
+
+ // Store the result as is in dst
+ vstore8(CONVERT_SAT(pixels, VEC_DATA_TYPE(DATA_TYPE_OUT, 8)), 0, ((__global DATA_TYPE_OUT *)dst.ptr));
+}
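+
+// Illustrative instantiation: with -DMATRIX_WIDTH=3 -DMATRIX_HEIGHT=5 the
+// loop above accumulates five convolution1x3 row passes at y offsets
+// -2..+2, i.e. a 3x5 rectangular filter, before dividing by SCALE.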
+
+#endif // DYNAMIC_MATRIX_CONVOLUTION
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
new file mode 100644
index 0000000000..c8eaa95352
--- /dev/null
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
+#else
+#define CONVERT_DOWN(x, type) CONVERT(x, type)
+#endif
+
+/** This function performs a down-scaling depth conversion.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_down(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int shift)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_IN, 16)
+ in_data = vload16(0, (__global DATA_TYPE_IN *)in.ptr);
+ vstore16(CONVERT_DOWN(in_data >> shift, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
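+
+// Illustrative instantiation: built with
+//   "-DDATA_TYPE_IN=short -DDATA_TYPE_OUT=uchar -DSATURATE"
+// and shift = 8, each S16 input is arithmetic-shifted right by 8 bits and
+// then saturated into the U8 range [0, 255] by CONVERT_SAT.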
+
+/** This function performs an up-scaling depth conversion.
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, U16, S16, U32 or S32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] shift The integer shift amount value. Supported data types: S32
+ */
+__kernel void convert_depth_up(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int shift)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ in_data = CONVERT(vload16(0, (__global DATA_TYPE_IN *)in.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+ vstore16(in_data << shift, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
new file mode 100644
index 0000000000..0e810d2e7c
--- /dev/null
+++ b/src/core/CL/cl_kernels/derivative.cl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel computes the first-order derivative.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void derivative(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+#ifdef GRAD_X
+ short16 l_data = convert_short16(vload16(0, offset(&src, -1, 0)));
+ short16 r_data = convert_short16(vload16(0, offset(&src, 1, 0)));
+ vstore16(r_data - l_data, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ short16 t_data = convert_short16(vload16(0, offset(&src, 0, -1)));
+ short16 b_data = convert_short16(vload16(0, offset(&src, 0, 1)));
+ vstore16(b_data - t_data, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
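+
+// In effect (illustrative): per pixel this computes the central differences
+//   gx(x, y) = src(x + 1, y) - src(x - 1, y)
+//   gy(x, y) = src(x, y + 1) - src(x, y - 1)
+// vectorised over 16 consecutive pixels per work-item.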
diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl
new file mode 100644
index 0000000000..c62c701757
--- /dev/null
+++ b/src/core/CL/cl_kernels/dilate.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function dilates an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void dilate(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ uchar16 tmp = max(top, max(middle, bottom));
+ uchar8 out = max(tmp.s01234567, max(tmp.s12345678, tmp.s23456789));
+
+ vstore8(out, 0, dst.ptr);
+}
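
Read element-wise, the vector code above is a 3x3 morphological dilation: a vertical max over three rows followed by a horizontal max over three adjacent columns. A scalar sketch of the per-pixel result (names illustrative):

    /* out(x, y) = maximum of the 3x3 neighborhood of src(x, y) */
    unsigned char dilate3x3(const unsigned char *src, int stride, int x, int y)
    {
        unsigned char m = 0;
        for (int dy = -1; dy <= 1; ++dy)
            for (int dx = -1; dx <= 1; ++dx)
            {
                unsigned char p = src[(y + dy) * stride + (x + dx)];
                if (p > m)
                    m = p;
            }
        return m;
    }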
diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl
new file mode 100644
index 0000000000..6576f1827f
--- /dev/null
+++ b/src/core/CL/cl_kernels/erode.cl
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function erodes an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void erode(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = min(tmp.s01234567, min(tmp.s12345678, tmp.s23456789));
+
+ vstore8(out, 0, dst.ptr);
+}
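
Erosion is the exact dual of the dilation above: the same row-wise reduction with min in place of max, so out(x, y) is the minimum of the 3x3 neighborhood of src(x, y). In the scalar sketch given after dilate.cl, initializing m to 255 and keeping the smaller value instead of the larger yields this kernel's result.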
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
new file mode 100644
index 0000000000..470d14a7b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/fast_corners.cl
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+/* The map table to retrieve the 16 texels in the Bresenham circle of radius 3 centered at P.
+ *
+ * . . F 0 1 . . .
+ * . E . . . 2 . .
+ * D . . . . . 3 .
+ * C . . P . . 4 .
+ * B . . . . . 5 .
+ * . A . . . 6 . .
+ * . . 9 8 7 . . .
+ */
+constant int offsets_s[16][2] =
+{
+ { 0, -3 }, // 0
+ { 1, -3 }, // 1
+ { 2, -2 }, // 2
+ { 3, -1 }, // 3
+ { 3, 0 }, // 4
+ { 3, 1 }, // 5
+ { 2, 2 }, // 6
+ { 1, 3 }, // 7
+ { 0, 3 }, // 8
+ { -1, 3 }, // 9
+ { -2, 2 }, // A
+ { -3, 1 }, // B
+ { -3, 0 }, // C
+ { -3, -1 }, // D
+ { -2, -2 }, // E
+ { -1, -3 }, // F
+};
+
+/** Load a pixel and set the mask values.
+ *
+ * @param[in] ptr The pointer to the starting address of source image
+ * @param[in] a Index to indicate the position in the Bresenham circle
+ * @param[in] stride Stride of source image in x dimension
+ * @param[in] dark The left end of the threshold range
+ * @param[in] bright The right end of the threshold range
+ * @param[out] dark_mask   The bit mask recording dark pixels. Bit a is set to 1 if the corresponding pixel is dark
+ * @param[out] bright_mask The bit mask recording bright pixels. Bit a is set to 1 if the corresponding pixel is bright
+ *
+ */
+#define LOAD_AND_SET_MASK(ptr, a, stride, dark, bright, dark_mask, bright_mask) \
+ { \
+ unsigned char pixel; \
+ pixel = *(ptr + (int)stride * offsets_s[a][1] + offsets_s[a][0]); \
+ dark_mask |= (pixel < dark) << a; \
+ bright_mask |= (pixel > bright) << a; \
+ }
+
+/** Checks if a pixel is a corner. A pixel is considered a corner if 9 contiguous pixels in the Bresenham circle are all bright or all dark.
+ *
+ * @param[in]  bright_mask The mask recording positions of bright pixels
+ * @param[in]  dark_mask   The mask recording positions of dark pixels
+ * @param[out] isCorner    Indicates whether the candidate pixel is a corner
+ */
+#define CHECK_CORNER(bright_mask, dark_mask, isCorner) \
+ { \
+ for(int i = 0; i < 16; i++) \
+ { \
+ isCorner |= ((bright_mask & 0x1FF) == 0x1FF); \
+ isCorner |= ((dark_mask & 0x1FF) == 0x1FF); \
+ if(isCorner) \
+ { \
+ break; \
+ } \
+ bright_mask >>= 1; \
+ dark_mask >>= 1; \
+ } \
+ }
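
CHECK_CORNER assumes the caller has duplicated the low 16 bits of each mask into the high 16 bits (as compute_strength and fast_corners do below): the duplication makes runs that wrap around position 15 back to 0 contiguous, so nine consecutive set bits can be matched against 0x1FF with plain shifts. A self-contained C sketch of the same test:

    /* Returns 1 if the 16-bit mask, read circularly, contains 9 consecutive set bits */
    int has_nine_consecutive(unsigned int mask)
    {
        unsigned int m = (mask & 0xFFFFu) | ((mask & 0xFFFFu) << 16);
        for (int i = 0; i < 16; ++i, m >>= 1)
        {
            if ((m & 0x1FFu) == 0x1FFu)
                return 1;
        }
        return 0;
    }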
+
+/* Calculates the corner strength of a pixel by bisecting the threshold range */
+uchar compute_strength(uchar candidate_pixel, __global unsigned char *ptr, unsigned int stride, unsigned char threshold)
+{
+ short a = threshold;
+ short b = 255;
+ while(b - a > 1)
+ {
+ uchar c = convert_uchar_sat((a + b) / 2);
+ unsigned int bright_mask = 0;
+ unsigned int dark_mask = 0;
+
+ unsigned char p_bright = add_sat(candidate_pixel, c);
+ unsigned char p_dark = sub_sat(candidate_pixel, c);
+
+ bool isCorner = 0;
+
+ for(uint i = 0; i < 16; i++)
+ {
+ LOAD_AND_SET_MASK(ptr, i, stride, p_dark, p_bright, dark_mask, bright_mask)
+ }
+
+ bright_mask |= (bright_mask << 16);
+ dark_mask |= (dark_mask << 16);
+ CHECK_CORNER(bright_mask, dark_mask, isCorner);
+
+ if(isCorner)
+ {
+ a = convert_short(c);
+ }
+ else
+ {
+ b = convert_short(c);
+ }
+ }
+ return a;
+}
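
compute_strength is a bisection over the threshold range: if the candidate still passes the corner test at threshold c, the answer lies in [c, b), otherwise in (a, c), so the loop keeps the invariant "a passes, b fails" and converges to the largest passing threshold. The same search stripped to its shape (is_corner_at is a hypothetical predicate standing in for the mask test above):

    short strength(short threshold, int (*is_corner_at)(short))
    {
        short a = threshold; /* known to pass */
        short b = 255;       /* treated as failing */
        while (b - a > 1)
        {
            short c = (short)((a + b) / 2);
            if (is_corner_at(c))
                a = c;
            else
                b = c;
        }
        return a;
    }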
+
+/** Fast corners implementation. Computes the strength of each pixel and stores it in the output image.
+ *
+ * The algorithm loops through the 16 pixels in the Bresenham circle and sets the low 16 bits of the masks when the corresponding pixel is bright
+ * or dark. It then copies the low 16 bits into the high 16 bits of the masks and right-shifts them, checking at each step whether the 9 contiguous
+ * bits starting from the LSB are set.
+ *
+ * @param[in]  input_ptr                             Pointer to the source image. Supported data types: U8
+ * @param[in]  input_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source image
+ * @param[out] output_ptr                            Pointer to the destination image. Supported data types: U8
+ * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @param[in] threshold_value Threshold value.
+ *
+ */
+__kernel void fast_corners(
+ IMAGE_DECLARATION(input),
+ IMAGE_DECLARATION(output),
+ float threshold_value)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+ Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+ const unsigned char threshold = (uchar)threshold_value;
+
+ unsigned int bright_mask = 0;
+ unsigned int dark_mask = 0;
+
+ unsigned char isCorner = 0;
+
+ unsigned char p = *in.ptr;
+ unsigned char p_bright = add_sat(p, threshold);
+ unsigned char p_dark = sub_sat(p, threshold);
+
+ LOAD_AND_SET_MASK(in.ptr, 0, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 4, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 8, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 12, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+ if(((bright_mask | dark_mask) & 0x1111) == 0)
+ {
+ *out.ptr = 0;
+ return;
+ }
+
+ LOAD_AND_SET_MASK(in.ptr, 1, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 2, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 3, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 5, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 6, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 7, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 9, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 10, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 11, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 13, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 14, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+ LOAD_AND_SET_MASK(in.ptr, 15, input_stride_y, p_dark, p_bright, dark_mask, bright_mask)
+
+ bright_mask |= (bright_mask << 16);
+ dark_mask |= (dark_mask << 16);
+
+ CHECK_CORNER(bright_mask, dark_mask, isCorner)
+
+ if(!isCorner)
+ {
+ *out.ptr = 0;
+ return;
+ }
+
+#ifndef USE_MAXSUPPRESSION
+ *out.ptr = 1;
+#else
+
+ *out.ptr = compute_strength(p, in.ptr, input_stride_y, threshold);
+#endif
+}
+
+/** Copies the results to the keypoint buffer and counts the number of corners
+ *
+ * @param[in]  input_ptr                            Pointer to the image with the computed strengths. Supported data types: U8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  max_num_points                       The maximum number of keypoints the array can hold
+ * @param[in]  offset                               Offset added to the keypoint coordinates to account for the skipped border pixels
+ * @param[out] num_of_points Number of points found
+ * @param[out] out The keypoints found
+ *
+ */
+__kernel void copy_to_keypoint(
+ IMAGE_DECLARATION(input),
+ uint max_num_points,
+ uint offset,
+ __global uint *num_of_points,
+ __global Keypoint *out)
+{
+#ifndef UPDATE_NUMBER
+ if(*num_of_points >= max_num_points)
+ {
+ return;
+ }
+#endif
+
+ Image in = CONVERT_TO_IMAGE_STRUCT(input);
+
+ uchar value = *in.ptr;
+
+ if(value > 0)
+ {
+ int id = atomic_inc(num_of_points);
+ if(id < max_num_points)
+ {
+ out[id].strength = value;
+ out[id].x = get_global_id(0) + offset;
+ out[id].y = get_global_id(1) + offset;
+ out[id].tracking_status = 1;
+ }
+ }
+}
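
copy_to_keypoint performs stream compaction: every work item that found a corner claims the next free output slot with an atomic increment. A CPU analogue using C11 atomics (the Keypoint layout here is illustrative, not the library's definition):

    #include <stdatomic.h>

    typedef struct { int x, y, strength, tracking_status; } Keypoint;

    void emit_keypoint(atomic_uint *num_of_points, Keypoint *out, unsigned max_num_points,
                       int x, int y, unsigned border, unsigned char strength)
    {
        if (strength == 0)
            return;
        unsigned id = atomic_fetch_add(num_of_points, 1); /* claim a slot */
        if (id < max_num_points)
        {
            Keypoint kp = { x + (int)border, y + (int)border, strength, 1 };
            out[id] = kp;
        }
    }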
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
new file mode 100644
index 0000000000..df635869b1
--- /dev/null
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Fills N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ */
+__kernel void fill_image_borders_replicate(
+ IMAGE_DECLARATION(buf),
+ uint width,
+ uint height,
+ int2 start_pos)
+{
+ Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+ // Update pointer to point to the starting point of the valid region
+ buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+ const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+ const int gid0 = get_global_id(0);
+ const int gidH = gid0 - total_width;
+ const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
+ }
+ // Handle right border
+ DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
+ }
+ }
+ else
+ {
+ // Get value for corners
+ int val_idx = gidW;
+ if(gidW < 0 || gidW > (width - 1))
+ {
+ val_idx = gidW < 0 ? 0 : width - 1;
+ }
+
+ // Handle top border
+ DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
+ }
+ // Handle bottom border
+ DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
+ }
+ }
+}
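
The two branches split one 1D work space over both border pairs: global ids in [0, total_width) fall in the else branch and each fill one column of the top and bottom borders (val_idx clamps out-of-range columns to the nearest corner pixel), while ids in [total_width, total_width + height) take the first branch and each fill one row of the left and right borders. This implies the host enqueues total_width + height work items along X, an inference from the index arithmetic rather than something stated in this file.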
+
+/** Fills N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ * @param[in] constant_value Constant value to use to fill the edges
+ */
+__kernel void fill_image_borders_constant(
+ IMAGE_DECLARATION(buf),
+ uint width,
+ uint height,
+ int2 start_pos,
+ DATA_TYPE constant_value)
+{
+ Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+ // Update pointer to point to the starting point of the valid region
+ buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+ const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+ const int gid0 = get_global_id(0);
+ const int gidH = gid0 - total_width;
+ const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+ if(gidH >= 0)
+ {
+ // Handle left border
+ for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
+ }
+ // Handle right border
+ for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
+ }
+ }
+ else
+ {
+ // Handle top border
+ for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
+ }
+ // Handle bottom border
+ for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+ {
+ *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
+ }
+ }
+}
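
Both kernels depend on DATA_TYPE and the four BORDER_SIZE_* values being baked in as preprocessor defines when the program is built. A minimal host-side sketch using the standard OpenCL API (the program and device variables are assumed to exist):

    const char *opts = "-DDATA_TYPE=uchar "
                       "-DBORDER_SIZE_TOP=1 -DBORDER_SIZE_BOTTOM=1 "
                       "-DBORDER_SIZE_LEFT=1 -DBORDER_SIZE_RIGHT=1";
    cl_int err = clBuildProgram(program, 1, &device, opts, NULL, NULL);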
diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl
new file mode 100644
index 0000000000..618937f36d
--- /dev/null
+++ b/src/core/CL/cl_kernels/gaussian_pyramid.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Computes the Gaussian Filter 1x5 + sub-sampling along the X direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian1x5_sub_x(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values for the convolution (20 bytes needed)
+ uchar16 temp0 = vload16(0, src.ptr);
+ uchar4 temp1 = vload4(0, src.ptr + 16);
+
+ // Convert to USHORT8
+ ushort8 l2_data = convert_ushort8((uchar8)(temp0.s02468ACE));
+ ushort8 l1_data = convert_ushort8((uchar8)(temp0.s13579BDF));
+ ushort8 m_data = convert_ushort8((uchar8)(temp0.s2468, temp0.sACE, temp1.s0));
+ ushort8 r1_data = convert_ushort8((uchar8)(temp0.s3579, temp0.sBDF, temp1.s1));
+ ushort8 r2_data = convert_ushort8((uchar8)(temp0.s468A, temp0.sCE, temp1.s02));
+
+ // Compute convolution along the X direction
+ ushort8 pixels = l2_data + r2_data;
+ pixels += l1_data * (ushort8)4;
+ pixels += m_data * (ushort8)6;
+ pixels += r1_data * (ushort8)4;
+
+ // Store result
+ vstore8(pixels, 0, (__global ushort *)dst.ptr);
+}
+
+/** Computes the Gaussian Filter 5x1 + sub-sampling along the Y direction
+ *
+ * @note Each thread computes 8 pixels
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void gaussian5x1_sub_y(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ ushort8 u2_data = vload8(0, (__global ushort *)offset(&src, 0, 0));
+ ushort8 u1_data = vload8(0, (__global ushort *)offset(&src, 0, 1));
+ ushort8 m_data = vload8(0, (__global ushort *)offset(&src, 0, 2));
+ ushort8 d1_data = vload8(0, (__global ushort *)offset(&src, 0, 3));
+ ushort8 d2_data = vload8(0, (__global ushort *)offset(&src, 0, 4));
+
+ // Compute convolution along the Y direction
+ ushort8 pixels = u2_data + d2_data;
+ pixels += u1_data * (ushort8)4;
+ pixels += m_data * (ushort8)6;
+ pixels += d1_data * (ushort8)4;
+
+ // Scale result
+ pixels >>= (ushort8)8;
+
+ // Store result
+ vstore8(convert_uchar8_sat(pixels), 0, dst.ptr);
+}
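
Both passes apply the binomial taps [1 4 6 4 1]. Each pass has a gain of 16, so the combined gain is 256, which the Y pass removes with the final ">> 8" before the saturating convert. A scalar reference for one tap application (names illustrative):

    /* One application of the 5-tap binomial kernel; the result carries a gain of 16 */
    unsigned binomial5(const unsigned char p[5])
    {
        return p[0] + 4u * p[1] + 6u * p[2] + 4u * p[3] + p[4];
    }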
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
new file mode 100644
index 0000000000..caf6e3ffd8
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -0,0 +1,1099 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel computes the "vector" 1x4 transposition of the input matrix
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+
+ /* Compute address for Matrix B - source */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+
+ float4 b0 = vload4(0, (__global float *)src.ptr);
+
+ vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
+}
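
The destination address follows from the block layout: work item (x, y) moves the 1x4 float block at block-column x of source row y to block-column y of output row x, hence the byte offset x * dst_stride_y + y * 16 (a 1x4 float block is 16 bytes). The 1x8 F16 and 1x16 U8 variants below move 16-byte blocks too, which is why all three use "y * 16". As a sketch (hypothetical helper, not part of the file):

    #include <stddef.h>

    /* Byte offset of the 16-byte block written by work item (x, y) */
    size_t transposed_block_offset(size_t x, size_t y, size_t dst_stride_y,
                                   size_t dst_offset_first_element_in_bytes)
    {
        return dst_offset_first_element_in_bytes + x * dst_stride_y + y * 16;
    }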
+
+/** This OpenCL kernel computes the "vector" 1x8 transposition of the input matrix
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F16
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+
+ /* Compute address for Matrix B - source */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+
+ half8 b0 = vload8(0, (__global half *)src.ptr);
+
+ vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
+}
+
+/** This OpenCL kernel computes the "vector" 1x16 transposition of the input matrix
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U8
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+
+ /* Compute address for Matrix B - source */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+
+ uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
+
+ vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
+}
+
+/** This OpenCL kernel reshapes the input matrix by transposing each 4x4 block and interleaving the values
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U32/S32/F32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from Matrix A */
+ float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
+ float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
+ float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
+ float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
+
+ float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
+
+ val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
+
+ val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
+
+ val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
+ vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
+}
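
The interleave writes each 4x4 block out column by column: output element c * 4 + r holds source element (r, c), so a matrix-multiply kernel can stream four A values per column with a single vload4. A scalar reference (names illustrative):

    /* 4x4 interleave: dst[c * 4 + r] = src[r][c] */
    void interleave4x4(const float src[4][4], float dst[16])
    {
        for (int c = 0; c < 4; ++c)
            for (int r = 0; r < 4; ++r)
                dst[c * 4 + r] = src[r][c];
    }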
+
+/** This OpenCL kernel reshapes the input matrix by transposing each 4x4 block and interleaving the values
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U16/S16/F16
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from Matrix A */
+ half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
+ half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
+ half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
+ half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
+
+ half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
+
+ val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
+
+ val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
+
+ val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
+ vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
+}
+
+/** This OpenCL kernel reshapes the input matrix by transposing each 4x4 block and interleaving the values
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: U8/S8
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from Matrix A */
+ uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
+ uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
+ uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
+ uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
+
+ uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
+ (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
+
+ val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
+ (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
+
+ val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
+ (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
+
+ val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
+ (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
+ vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
+}
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same data type as the accumulate tensor.
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+__kernel void gemm_accumulate_biases_f32(
+ IMAGE_DECLARATION(accum),
+ VECTOR_DECLARATION(biases))
+{
+ Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ float4 accum_value = vload4(0, (__global float *)accum.ptr);
+ float4 biases_value = vload4(0, (__global float *)biases.ptr);
+ accum_value = biases_value + accum_value;
+
+    // Store result in the accumulate buffer
+ vstore4(accum_value, 0, (__global float *)accum.ptr);
+}
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Same data type as the accumulate tensor.
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+__kernel void gemm_accumulate_biases_f16(
+ IMAGE_DECLARATION(accum),
+ VECTOR_DECLARATION(biases))
+{
+ Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ half8 accum_value = vload8(0, (__global half *)accum.ptr);
+ half8 biases_value = vload8(0, (__global half *)biases.ptr);
+ accum_value = biases_value + accum_value;
+
+    // Store result in the accumulate buffer
+ vstore8(accum_value, 0, (__global half *)accum.ptr);
+}
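
Element-wise, both variants compute accum[y][x] += biases[x]: each work item adds one vector-width chunk of the bias vector to the matching chunk of its row. A scalar reference (names illustrative):

    /* Adds the bias vector to every row of the accumulator */
    void accumulate_biases(float *accum, const float *biases, int width, int height, int stride)
    {
        for (int y = 0; y < height; ++y)
            for (int x = 0; x < width; ++x)
                accum[y * stride + x] += biases[x];
    }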
+
+#if(defined WIDTH_MATRIX_B)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16_u8 before running the matrix multiplication
+ *
+ * @attention The width of matrix B needs to be passed at compile time using -DWIDTH_MATRIX_B
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported formats: U8
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] a_offset Offset to be added to each element of the matrix A
+ * @param[in] b_offset Offset to be added to each element of the matrix B.
+ * @param[in] c_offset Offset to be added to each element of the matrix C.
+ * @param[in] c_mult_int Multiplied with each element of the matrix C.
+ * @param[in] shift Number of bits to shift right the result.
+ */
+__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ int a_offset,
+ int b_offset,
+ int c_offset,
+ int c_mult_int,
+ int shift)
+{
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
+
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+ /* Reset accumulators */
+    int16 c00 = 0;
+    int16 c10 = 0;
+    int16 c20 = 0;
+    int16 c30 = 0;
+
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+ int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+ c00 += (int16)a0.s0 * b0;
+ c10 += (int16)a0.s1 * b0;
+ c20 += (int16)a0.s2 * b0;
+ c30 += (int16)a0.s3 * b0;
+
+ int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
+
+ c00 += (int16)a0.s4 * b1;
+ c10 += (int16)a0.s5 * b1;
+ c20 += (int16)a0.s6 * b1;
+ c30 += (int16)a0.s7 * b1;
+ }
+
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+ int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+ c00 += (int16)a0.s0 * b0;
+ c10 += (int16)a0.s1 * b0;
+ c20 += (int16)a0.s2 * b0;
+ c30 += (int16)a0.s3 * b0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of matrix product */
+ c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
+ c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
+ c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
+ c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
+
+ /* Store 4x16 block */
+ vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
+ vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
+ vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
+ vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
+}
+#endif
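
Ignoring the reshaped layouts, each output element of gemm_mm_u8 is the requantized product dst(i, j) = sat_u8(((c_offset + sum_k (A(i, k) + a_offset) * (B(k, j) + b_offset)) * c_mult_int) >> shift). A scalar reference over plain row-major matrices, without the interleaving (names illustrative):

    unsigned char gemm_u8_element(const unsigned char *A, const unsigned char *B,
                                  int K, int lda, int ldb, int i, int j,
                                  int a_offset, int b_offset, int c_offset,
                                  int c_mult_int, int shift)
    {
        int acc = 0;
        for (int k = 0; k < K; ++k)
            acc += (A[i * lda + k] + a_offset) * (B[k * ldb + j] + b_offset);
        acc = ((c_offset + acc) * c_mult_int) >> shift;
        return (unsigned char)(acc < 0 ? 0 : (acc > 255 ? 255 : acc));
    }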
+
+#if(defined WIDTH_MATRIX_B && defined ALPHA)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
+
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ /* Divide by 4 in order to get the src_addr in unit of float */
+ src_addr = src_addr >> 2;
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+ /* Reset accumulators */
+ float4 c00 = 0.0f;
+ float4 c10 = 0.0f;
+ float4 c20 = 0.0f;
+ float4 c30 = 0.0f;
+
+ for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
+ float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+
+ c00 += (float4)a0.s0 * b0;
+ c10 += (float4)a0.s1 * b0;
+ c20 += (float4)a0.s2 * b0;
+ c30 += (float4)a0.s3 * b0;
+
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
+ b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
+
+ c00 += (float4)a0.s0 * b0;
+ c10 += (float4)a0.s1 * b0;
+ c20 += (float4)a0.s2 * b0;
+ c30 += (float4)a0.s3 * b0;
+ }
+
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
+ float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+
+ c00 += (float4)a0.s0 * b0;
+ c10 += (float4)a0.s1 * b0;
+ c20 += (float4)a0.s2 * b0;
+ c30 += (float4)a0.s3 * b0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of matrix product */
+ c00 = c00 * (float4)ALPHA;
+ c10 = c10 * (float4)ALPHA;
+ c20 = c20 * (float4)ALPHA;
+ c30 = c30 * (float4)ALPHA;
+
+ /* Store 4x4 block */
+ vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
+ vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
+ vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
+ vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
+}
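
After the reshape, each loop iteration holds a 4-element column of A (a0) and a 4-element row of B (b0), so the four float4 accumulators here (and the sixteen scalars in the Bifrost variant below) accumulate one rank-1 update per iteration. The step as a scalar reference (names illustrative):

    /* One rank-1 update: C (4x4) += a (4x1) * b (1x4) */
    void rank1_update_4x4(float C[4][4], const float a[4], const float b[4])
    {
        for (int r = 0; r < 4; ++r)
            for (int c = 0; c < 4; ++c)
                C[r][c] += a[r] * b[c];
    }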
+
+/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4_f32 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: F32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+
+ // Compute end row address for matrix B
+ __global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;
+
+ // Reset accumulators
+ float c00 = 0.0f;
+ float c01 = 0.0f;
+ float c02 = 0.0f;
+ float c03 = 0.0f;
+ float c10 = 0.0f;
+ float c11 = 0.0f;
+ float c12 = 0.0f;
+ float c13 = 0.0f;
+ float c20 = 0.0f;
+ float c21 = 0.0f;
+ float c22 = 0.0f;
+ float c23 = 0.0f;
+ float c30 = 0.0f;
+ float c31 = 0.0f;
+ float c32 = 0.0f;
+ float c33 = 0.0f;
+
+ for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4);
+ b0 = vload4(0, src_addr_b + 4);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 8);
+ b0 = vload4(0, src_addr_b + 8);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 12);
+ b0 = vload4(0, src_addr_b + 12);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+ }
+
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c00 = fma(a0.s0, b0.s0, c00);
+ c01 = fma(a0.s0, b0.s1, c01);
+ c02 = fma(a0.s0, b0.s2, c02);
+ c03 = fma(a0.s0, b0.s3, c03);
+
+ c10 = fma(a0.s1, b0.s0, c10);
+ c11 = fma(a0.s1, b0.s1, c11);
+ c12 = fma(a0.s1, b0.s2, c12);
+ c13 = fma(a0.s1, b0.s3, c13);
+
+ c20 = fma(a0.s2, b0.s0, c20);
+ c21 = fma(a0.s2, b0.s1, c21);
+ c22 = fma(a0.s2, b0.s2, c22);
+ c23 = fma(a0.s2, b0.s3, c23);
+
+ c30 = fma(a0.s3, b0.s0, c30);
+ c31 = fma(a0.s3, b0.s1, c31);
+ c32 = fma(a0.s3, b0.s2, c32);
+ c33 = fma(a0.s3, b0.s3, c33);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Multiply by the weight of matrix product
+ c00 = c00 * ALPHA;
+ c01 = c01 * ALPHA;
+ c02 = c02 * ALPHA;
+ c03 = c03 * ALPHA;
+ c10 = c10 * ALPHA;
+ c11 = c11 * ALPHA;
+ c12 = c12 * ALPHA;
+ c13 = c13 * ALPHA;
+ c20 = c20 * ALPHA;
+ c21 = c21 * ALPHA;
+ c22 = c22 * ALPHA;
+ c23 = c23 * ALPHA;
+ c30 = c30 * ALPHA;
+ c31 = c31 * ALPHA;
+ c32 = c32 * ALPHA;
+ c33 = c33 * ALPHA;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ // Store 4x4 block
+ vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
+ vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
+ vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
+ vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
+}
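+
+/* For reference, ignoring the interleaved/transposed layout produced by the
+ * reshape kernels, the 4x4 block accumulated above is equivalent to this
+ * scalar sketch (the names and the loop bound K are illustrative only):
+ *
+ *     float c[4][4] = { { 0.0f } };         // the accumulators c00..c33
+ *     for(int k = 0; k < K; ++k)            // K = reduction dimension
+ *         for(int i = 0; i < 4; ++i)        // 4 rows of the output block
+ *             for(int j = 0; j < 4; ++j)    // 4 columns of the output block
+ *                 c[i][j] = fma(a[i][k], b[k][j], c[i][j]);
+ *     // every c[i][j] is then scaled by ALPHA before being stored
+ */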
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ /* src_addr.s0 = address of matrix A */
+ /* src_addr.s1 = address of matrix B */
+
+ /* Compute address for matrix A and B */
+ int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+ (src1_stride_y));
+
+ /* Add offset_first_element_in_bytes */
+ src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ /* Divide by 2 in order to get the src_addr in units of half */
+ src_addr = src_addr >> 1;
+
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
+
+ /* Reset accumulators */
+ half8 c00 = 0.0f;
+ half8 c10 = 0.0f;
+ half8 c20 = 0.0f;
+ half8 c30 = 0.0f;
+
+ for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
+ half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
+ b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+ }
+
+ for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
+ half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+
+ c00 += (half8)a0.s0 * b0;
+ c10 += (half8)a0.s1 * b0;
+ c20 += (half8)a0.s2 * b0;
+ c30 += (half8)a0.s3 * b0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of matrix product */
+ c00 = c00 * (half8)ALPHA;
+ c10 = c10 * (half8)ALPHA;
+ c20 = c20 * (half8)ALPHA;
+ c30 = c30 * (half8)ALPHA;
+
+ /* Store 4x8 block */
+ vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
+ vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
+ vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
+ vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
+}
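+
+/* WIDTH_MATRIX_B and ALPHA are not kernel arguments: they are baked in as
+ * program build options. A minimal host-side sketch (the values shown are
+ * placeholders, not taken from the library):
+ *
+ *     const char *opts = "-DWIDTH_MATRIX_B=256 -DALPHA=1.0f";
+ *     clBuildProgram(program, 1, &device, opts, NULL, NULL);
+ *
+ * Within the library, the kernel's configure step assembles this option
+ * string and passes it to CLKernelLibrary when the kernel is created.
+ */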
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+ *
+ * @attention The width of vector A, the width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_VECTOR_A, -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @attention The input vector A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of vector-matrix product */
+ acc = acc * (float4)ALPHA;
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
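+
+/* In scalar terms each work-item above computes four consecutive outputs
+ *
+ *     dst[idx + j] = ALPHA * sum_i(A[i] * B[i][idx + j]),   j = 0..3
+ *
+ * with the first loop consuming two elements of A (and two rows of B) per
+ * iteration and the second loop mopping up the last element when
+ * WIDTH_VECTOR_A is odd.
+ */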
+
+/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
+ *
+ * @attention The width of vector A, the width of matrix B and the value of alpha need to be passed at compile time using -DWIDTH_VECTOR_A, -DWIDTH_MATRIX_B and -DALPHA
+ *
+ * @attention The input vector A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 8;
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ src_addr.s1 += idx * sizeof(half);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
+
+ half8 acc = 0.0f;
+
+ for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
+ {
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+ half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+ half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
+ half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+
+ acc += b0 * (half8)a0.s0;
+ acc += b1 * (half8)a0.s1;
+ acc += b2 * (half8)a0.s2;
+ acc += b3 * (half8)a0.s3;
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
+ {
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0));
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (half8)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Multiply by the weight of vector-matrix product */
+ acc = acc * (half8)ALPHA;
+
+ vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
+}
+#endif /* (defined WIDTH_VECTOR_A) */
+#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
+
+#if(defined BETA)
+/** This OpenCL kernel performs the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta
+ *
+ * @attention The value of beta needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+ /* Load values from Matrix C */
+ float4 c = vload4(0, (__global float *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ float4 out = alpha_ab + (float4)BETA * c;
+
+ /* Store final result in axb matrix */
+ vstore4(out, 0, (__global float *)dst.ptr);
+}
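+
+/* Together with the gemm_mm_* kernels above this completes the full GEMM
+ * D = alpha * (A * B) + beta * C: a gemm_mm_* kernel leaves alpha * (A * B)
+ * in dst, and this kernel then adds beta * C in place. Illustrative order of
+ * enqueues (pseudo-flow, not library API):
+ *
+ *     run gemm_mm_* (...);    // dst <- alpha * (A * B)
+ *     run gemm_ma_f32(...);   // dst <- dst + beta * C
+ */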
+
+/** This OpenCL kernel performs the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F16
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ /* Compute source and destination addresses */
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load values from A x B */
+ half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+ /* Load values from Matrix C */
+ half8 c = vload8(0, (__global half *)src.ptr);
+
+ /* Computes alpha * axb + beta * c */
+ half8 out = alpha_ab + (half8)BETA * c;
+
+ /* Store final result in axb matrix */
+ vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif /* (defined BETA) */
+
+#if(defined WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1), as used by the locally connected layer
+ *
+ * @attention The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @attention The input A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),
+ TENSOR3D_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+ int idy = get_global_id(1);
+
+ /* Compute the address for the vector A and matrix B */
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ /* Compute destination address */
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
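+
+/* The locally connected variant differs from gemm_vm_f32 only in its
+ * addressing: row idy of src0 is multiplied by plane z = idy of the 3D
+ * tensor src1 (the src1_stride_z * idy term above), so every output row
+ * effectively has its own weight matrix instead of one shared matrix.
+ */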
+#endif /* (defined WIDTH_VECTOR_A) */
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl
new file mode 100644
index 0000000000..5320a064ed
--- /dev/null
+++ b/src/core/CL/cl_kernels/harris_corners.cl
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Function running harris score on 3x3 block size
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply to the harris score
+ */
+__kernel void harris_score_3x3(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy */
+ float4 gx2 = (float4)0.0f;
+ float4 gy2 = (float4)0.0f;
+ float4 gxgy = (float4)0.0f;
+
+ /* Row0 */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, -1));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, -1));
+
+ float4 l_gx = convert_float4(temp_gx.s0123);
+ float4 m_gx = convert_float4(temp_gx.s1234);
+ float4 r_gx = convert_float4(temp_gx.s2345);
+
+ float4 l_gy = convert_float4(temp_gy.s0123);
+ float4 m_gy = convert_float4(temp_gy.s1234);
+ float4 r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Row1 */
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 0));
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 0));
+
+ l_gx = convert_float4(temp_gx.s0123);
+ m_gx = convert_float4(temp_gx.s1234);
+ r_gx = convert_float4(temp_gx.s2345);
+
+ l_gy = convert_float4(temp_gy.s0123);
+ m_gy = convert_float4(temp_gy.s1234);
+ r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Row2 */
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(&src_gx, -1, 1));
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(&src_gy, -1, 1));
+
+ l_gx = convert_float4(temp_gx.s0123);
+ m_gx = convert_float4(temp_gx.s1234);
+ r_gx = convert_float4(temp_gx.s2345);
+
+ l_gy = convert_float4(temp_gy.s0123);
+ m_gy = convert_float4(temp_gy.s1234);
+ r_gy = convert_float4(temp_gy.s2345);
+
+ gx2 += (l_gx * l_gx) + (m_gx * m_gx) + (r_gx * r_gx);
+ gy2 += (l_gy * l_gy) + (m_gy * m_gy) + (r_gy * r_gy);
+ gxgy += (l_gx * l_gy) + (m_gx * m_gy) + (r_gx * r_gy);
+
+ /* Compute trace and determinant */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
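+
+/* The score above is the Harris-Stephens response. With the 2x2 structure
+ * tensor M accumulated over the block,
+ *
+ *     M = | gx2  gxgy |     Mc = det(M) - k * trace(M)^2
+ *         | gxgy gy2  |        = (gx2 * gy2 - gxgy^2) - k * (gx2 + gy2)^2
+ *
+ * where k is the sensitivity argument; the result is scaled by
+ * pow4_normalization_factor and zeroed when it does not exceed
+ * strength_thresh.
+ */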
+
+/** Function for calculating harris score 1x5.
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row Relative row.
+ */
+inline float16 harris_score_1x5(Image *src_gx, Image *src_gy, int row)
+{
+ float4 gx2 = 0.0f;
+ float4 gy2 = 0.0f;
+ float4 gxgy = 0.0f;
+
+ /* Row */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx = vload8(0, (__global DATA_TYPE *)offset(src_gx, -2, row));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy = vload8(0, (__global DATA_TYPE *)offset(src_gy, -2, row));
+
+ float4 gx = convert_float4(temp_gx.s0123);
+ float4 gy = convert_float4(temp_gy.s0123);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s1234);
+ gy = convert_float4(temp_gy.s1234);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s2345);
+ gy = convert_float4(temp_gy.s2345);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s3456);
+ gy = convert_float4(temp_gy.s3456);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx.s4567);
+ gy = convert_float4(temp_gy.s4567);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ return (float16)(gx2, gy2, gxgy, (float4)0);
+}
+
+/** Function running harris score on 5x5 block size
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply to the harris score
+ */
+__kernel void harris_score_5x5(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy */
+ float16 res = (float16)0.0f;
+
+ /* Compute row */
+ for(int i = -2; i < 3; i++)
+ {
+ res += harris_score_1x5(&src_gx, &src_gy, i);
+ }
+
+ float4 gx2 = res.s0123;
+ float4 gy2 = res.s4567;
+ float4 gxgy = res.s89AB;
+
+ /* Compute trace and determinant */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
+
+/** Function for calculating harris score 1x7.
+ *
+ * @param[in] src_gx Pointer to gx gradient image.
+ * @param[in] src_gy Pointer to gy gradient image.
+ * @param[in] row Relative row.
+ */
+inline float16 harris_score_1x7(Image *src_gx, Image *src_gy, int row)
+{
+ float4 gx2 = 0.0f;
+ float4 gy2 = 0.0f;
+ float4 gxgy = 0.0f;
+
+ /* Row */
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gx0 = vload8(0, (__global DATA_TYPE *)offset(src_gx, -3, row));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ temp_gy0 = vload8(0, (__global DATA_TYPE *)offset(src_gy, -3, row));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp_gx1 = vload2(0, (__global DATA_TYPE *)offset(src_gx, 5, row));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ temp_gy1 = vload2(0, (__global DATA_TYPE *)offset(src_gy, 5, row));
+
+ float4 gx = convert_float4(temp_gx0.s0123);
+ float4 gy = convert_float4(temp_gy0.s0123);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s1234);
+ gy = convert_float4(temp_gy0.s1234);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s2345);
+ gy = convert_float4(temp_gy0.s2345);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s3456);
+ gy = convert_float4(temp_gy0.s3456);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4(temp_gx0.s4567);
+ gy = convert_float4(temp_gy0.s4567);
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s567, temp_gx1.s0));
+ gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s567, temp_gy1.s0));
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ gx = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gx0.s67, temp_gx1.s01));
+ gy = convert_float4((VEC_DATA_TYPE(DATA_TYPE, 4))(temp_gy0.s67, temp_gy1.s01));
+ gx2 += (gx * gx);
+ gy2 += (gy * gy);
+ gxgy += (gx * gy);
+
+ return (float16)(gx2, gy2, gxgy, (float4)0);
+}
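+
+/* Note on the split loads above: producing 4 horizontally adjacent results
+ * from a 7-tap window requires 4 + 7 - 1 = 10 consecutive gradient values,
+ * so each row is fetched as a vload8 plus a vload2 and the last two 4-wide
+ * windows are stitched together from both vectors.
+ */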
+
+/** Function running harris score on 7x7 block size
+ *
+ * @attention: The input data type should be passed using a compile option -DDATA_TYPE. Supported types: short and int.
+ * e.g. -DDATA_TYPE=short.
+ *
+ * @param[in] src_gx_ptr Pointer to the first source image. Supported data types: S16, S32
+ * @param[in] src_gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_gx_step_x src_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_gx_step_y src_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_gy_ptr Pointer to the second source image. Supported data types: S16, S32
+ * @param[in] src_gy_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] src_gy_step_x src_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_gy_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] src_gy_step_y src_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] vc_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] vc_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] vc_step_x vc_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] vc_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] vc_step_y vc_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] vc_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
+ * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores
+ * @param[in] pow4_normalization_factor Normalization factor to apply to the harris score
+ */
+__kernel void harris_score_7x7(
+ IMAGE_DECLARATION(src_gx),
+ IMAGE_DECLARATION(src_gy),
+ IMAGE_DECLARATION(vc),
+ float sensitivity,
+ float strength_thresh,
+ float pow4_normalization_factor)
+{
+ Image src_gx = CONVERT_TO_IMAGE_STRUCT(src_gx);
+ Image src_gy = CONVERT_TO_IMAGE_STRUCT(src_gy);
+ Image vc = CONVERT_TO_IMAGE_STRUCT(vc);
+
+ /* Gx^2, Gy^2 and Gx*Gy */
+ float16 res = (float16)0.0f;
+
+ /* Compute row */
+ for(int i = -3; i < 4; i++)
+ {
+ res += harris_score_1x7(&src_gx, &src_gy, i);
+ }
+
+ float4 gx2 = res.s0123;
+ float4 gy2 = res.s4567;
+ float4 gxgy = res.s89AB;
+
+ /* Compute trace and determinant */
+ float4 trace = gx2 + gy2;
+ float4 det = gx2 * gy2 - (gxgy * gxgy);
+
+ /* Compute harris score */
+ float4 mc = (det - (sensitivity * (trace * trace))) * pow4_normalization_factor;
+
+ mc = select(0.0f, mc, mc > (float4)strength_thresh);
+
+ vstore4(mc, 0, (__global float *)vc.ptr);
+}
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
new file mode 100644
index 0000000000..6db8ed567c
--- /dev/null
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT_STR(x, type) (convert_##type((x)))
+#define CONVERT(x, type) CONVERT_STR(x, type)
+
+#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
+#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+
+#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
+#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+
+#define VECTOR_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_offset_first_element_in_bytes
+
+#define IMAGE_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR3D_DECLARATION(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_offset_first_element_in_bytes
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+ update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+ name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
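+
+/* Example expansion (the kernel name foo is illustrative): a kernel written
+ * as
+ *
+ *     __kernel void foo(IMAGE_DECLARATION(src))
+ *     {
+ *         Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ *         ...
+ *     }
+ *
+ * actually receives src_ptr, src_stride_x, src_step_x, src_stride_y,
+ * src_step_y and src_offset_first_element_in_bytes as separate arguments;
+ * the CONVERT_* macro repackages them into an Image whose ptr already
+ * points at this work-item's first element.
+ */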
+
+/** Structure to hold Vector information */
+typedef struct Vector
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source vector */
+ int stride_x; /**< Stride of the vector in X dimension (in bytes) */
+} Vector;
+
+/** Structure to hold Image information */
+typedef struct Image
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+} Image;
+
+/** Structure to hold 3D tensor information */
+typedef struct Tensor3D
+{
+ __global uchar *ptr; /**< Pointer to the starting position of the buffer */
+ int offset_first_element_in_bytes; /**< The offset of the first element in the source tensor */
+ int stride_x; /**< Stride of the tensor in X dimension (in bytes) */
+ int stride_y; /**< Stride of the tensor in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the tensor in Z dimension (in bytes) */
+} Tensor3D;
+
+/** Wrap vector information into a Vector structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A vector object
+ */
+Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+ Vector vector =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ };
+ vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
+ return vector;
+}
+
+/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ Image img =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y
+ };
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
+ return img;
+}
+
+/** Wrap 3D tensor information into a Tensor3D structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+ Tensor3D tensor =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y,
+ .stride_z = stride_z
+ };
+ tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ */
+__global inline const uchar *vector_offset(const Vector *vec, int x)
+{
+ return vec->ptr + x * vec->stride_x;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ */
+__global inline uchar *offset(const Image *img, int x, int y)
+{
+ return img->ptr + x * img->stride_x + y * img->stride_y;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ */
+__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+{
+ return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
+}
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
new file mode 100644
index 0000000000..a652b28e6a
--- /dev/null
+++ b/src/core/CL/cl_kernels/histogram.cl
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define VATOMIC_INC16(histogram, win_pos) \
+ { \
+ atomic_inc(histogram + win_pos.s0); \
+ atomic_inc(histogram + win_pos.s1); \
+ atomic_inc(histogram + win_pos.s2); \
+ atomic_inc(histogram + win_pos.s3); \
+ atomic_inc(histogram + win_pos.s4); \
+ atomic_inc(histogram + win_pos.s5); \
+ atomic_inc(histogram + win_pos.s6); \
+ atomic_inc(histogram + win_pos.s7); \
+ atomic_inc(histogram + win_pos.s8); \
+ atomic_inc(histogram + win_pos.s9); \
+ atomic_inc(histogram + win_pos.sa); \
+ atomic_inc(histogram + win_pos.sb); \
+ atomic_inc(histogram + win_pos.sc); \
+ atomic_inc(histogram + win_pos.sd); \
+ atomic_inc(histogram + win_pos.se); \
+ atomic_inc(histogram + win_pos.sf); \
+ }
+
+/** Calculate the histogram of an 8-bit grayscale image.
+ *
+ * Each thread will process 16 pixels and use one local atomic operation per pixel.
+ * When all work items in a work group are done, the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length num_bins
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] histogram_local The local buffer used to hold the histogram result of each work group. Supported data types: U32
+ * @param[out] histogram The output buffer holding the final histogram result. Supported data types: U32
+ * @param[in] num_bins The number of bins
+ * @param[in] offset The start of values to use (inclusive)
+ * @param[in] range The range of a bin
+ * @param[in] offrange The maximum value (exclusive)
+ */
+__kernel void hist_local_kernel(IMAGE_DECLARATION(input),
+ __local uint *histogram_local,
+ __global uint *restrict histogram,
+ uint num_bins,
+ uint offset,
+ uint range,
+ uint offrange)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+ uint local_id_x = get_local_id(0);
+
+ uint local_x_size = get_local_size(0);
+
+ if(num_bins > local_x_size)
+ {
+ for(int i = local_id_x; i < num_bins; i += local_x_size)
+ {
+ histogram_local[i] = 0;
+ }
+ }
+ else
+ {
+ if(local_id_x <= num_bins)
+ {
+ histogram_local[local_id_x] = 0;
+ }
+ }
+
+ uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
+
+ uint16 win_pos = select(num_bins, ((vals - offset) * num_bins) / range, (vals >= offset && vals < offrange));
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ VATOMIC_INC16(histogram_local, win_pos);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(num_bins > local_x_size)
+ {
+ for(int i = local_id_x; i < num_bins; i += local_x_size)
+ {
+ atomic_add(histogram + i, histogram_local[i]);
+ }
+ }
+ else
+ {
+ if(local_id_x <= num_bins)
+ {
+ atomic_add(histogram + local_id_x, histogram_local[local_id_x]);
+ }
+ }
+}
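+
+/* Worked example of the bin mapping above: with offset = 0, range = 256 and
+ * num_bins = 256, win_pos = ((val - 0) * 256) / 256 = val, i.e. one bin per
+ * grey level. Out-of-range pixels are mapped by the select() to index
+ * num_bins, an extra slot that lets VATOMIC_INC16 discard them without
+ * branching.
+ */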
+
+/** Calculate the histogram of the border of an 8-bit grayscale image.
+ *
+ * Each thread processes one pixel and, when the pixel falls inside the
+ * [offset, offrange) window, updates the global histogram directly with a
+ * global atomic operation.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of length num_bins
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] histogram The output buffer holding the final histogram result. Supported data types: U32
+ * @param[in] num_bins The number of bins
+ * @param[in] offset The start of values to use (inclusive)
+ * @param[in] range The range of a bin
+ * @param[in] offrange The maximum value (exclusive)
+ */
+__kernel void hist_border_kernel(IMAGE_DECLARATION(input),
+ __global uint *restrict histogram,
+ uint num_bins,
+ uint offset,
+ uint range,
+ uint offrange)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+ uint val = (uint)(*input_buffer.ptr);
+
+ uint win_pos = (val >= offset) ? (((val - offset) * num_bins) / range) : 0;
+
+ if(val >= offset && (val < offrange))
+ {
+ atomic_inc(histogram + win_pos);
+ }
+}
+
+/** Calculate the histogram of an 8-bit grayscale image with a bin size of 256 and a window size of 1.
+ *
+ * Each thread will process 16 pixels and use one local atomic operation per pixel.
+ * When all work items in a work group are done, the resulting local histograms are
+ * added to the global histogram using global atomics.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256 elements
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] histogram_local The local buffer used to hold the histogram result of each work group. Supported data types: U32
+ * @param[out] histogram The output buffer holding the final histogram result. Supported data types: U32
+ */
+__kernel void hist_local_kernel_fixed(IMAGE_DECLARATION(input),
+ __local uint *histogram_local,
+ __global uint *restrict histogram)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+
+ uint local_index = get_local_id(0);
+ uint local_x_size = get_local_size(0);
+
+ for(int i = local_index; i < 256; i += local_x_size)
+ {
+ histogram_local[i] = 0;
+ }
+
+ uint16 vals = convert_uint16(vload16(0, input_buffer.ptr));
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ atomic_inc(histogram_local + vals.s0);
+ atomic_inc(histogram_local + vals.s1);
+ atomic_inc(histogram_local + vals.s2);
+ atomic_inc(histogram_local + vals.s3);
+ atomic_inc(histogram_local + vals.s4);
+ atomic_inc(histogram_local + vals.s5);
+ atomic_inc(histogram_local + vals.s6);
+ atomic_inc(histogram_local + vals.s7);
+ atomic_inc(histogram_local + vals.s8);
+ atomic_inc(histogram_local + vals.s9);
+ atomic_inc(histogram_local + vals.sa);
+ atomic_inc(histogram_local + vals.sb);
+ atomic_inc(histogram_local + vals.sc);
+ atomic_inc(histogram_local + vals.sd);
+ atomic_inc(histogram_local + vals.se);
+ atomic_inc(histogram_local + vals.sf);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for(int i = local_index; i < 256; i += local_x_size)
+ {
+ atomic_add(histogram + i, histogram_local[i]);
+ }
+}
+
+/** Calculate the histogram of an 8-bit grayscale image with a bin size of 256 and a window size of 1.
+ *
+ * Each thread processes one pixel and updates the global histogram directly
+ * using a global atomic operation.
+ *
+ * @note The input image is represented as a two-dimensional array of type uchar.
+ * The output is represented as a one-dimensional uint array of 256 elements
+ *
+ * @param[in] input_ptr Pointer to the first source image. Supported data types: U8
+ * @param[in] input_stride_x Stride of the first source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[out] histogram The output buffer holding the final histogram result. Supported data types: U32
+ */
+__kernel void hist_border_kernel_fixed(IMAGE_DECLARATION(input),
+ __global uint *restrict histogram)
+{
+ Image input_buffer = CONVERT_TO_IMAGE_STRUCT(input);
+ atomic_inc(histogram + *input_buffer.ptr);
+}
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
new file mode 100644
index 0000000000..31dd57b767
--- /dev/null
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#if(defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+/** This OpenCL kernel computes the HOG orientation binning
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DCELL_WIDTH = Width of the cell
+ * -# -DCELL_HEIGHT = Height of the cell
+ * -# -DNUM_BINS = Number of bins for each cell
+ * -# -DPHASE_SCALE = Scale factor used to evaluate the index of the local HOG
+ *
+ * @note Each work-item computes a single cell
+ *
+ * @param[in] mag_ptr Pointer to the source image which stores the magnitude of the gradient for each pixel. Supported data types: S16
+ * @param[in] mag_stride_x Stride of the magnitude image in X dimension (in bytes)
+ * @param[in] mag_step_x mag_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mag_stride_y Stride of the magnitude image in Y dimension (in bytes)
+ * @param[in] mag_step_y mag_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mag_offset_first_element_in_bytes The offset of the first element in the magnitude image
+ * @param[in] phase_ptr Pointer to the source image which stores the phase of the gradient for each pixel. Supported data types: U8
+ * @param[in] phase_stride_x Stride of the phase image in X dimension (in bytes)
+ * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] phase_stride_y Stride of the phase image in Y dimension (in bytes)
+ * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the phase image
+ * @param[out] dst_ptr Pointer to the destination image which stores the local HOG for each cell. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_orientation_binning(IMAGE_DECLARATION(mag),
+ IMAGE_DECLARATION(phase),
+ IMAGE_DECLARATION(dst))
+{
+ float bins[NUM_BINS] = { 0 };
+
+ // Compute address for the magnitude and phase images
+ Image mag = CONVERT_TO_IMAGE_STRUCT(mag);
+ Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+
+ __global uchar *mag_row_ptr = mag.ptr;
+ __global uchar *phase_row_ptr = phase.ptr;
+
+ for(int yc = 0; yc < CELL_HEIGHT; ++yc)
+ {
+ int xc = 0;
+ for(; xc <= (CELL_WIDTH - 4); xc += 4)
+ {
+ // Load magnitude and phase values
+ const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
+ float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
+
+ // Scale phase: phase * scale + 0.5f
+ phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
+
+ // Compute histogram index.
+ int4 hidx_s32 = convert_int4(phase_f32);
+
+ // Compute magnitude weights (w0 and w1)
+ const float4 hidx_f32 = convert_float4(hidx_s32);
+
+ // w1 = phase_f32 - hidx_s32
+ const float4 w1_f32 = phase_f32 - hidx_f32;
+
+ // w0 = 1.0 - w1
+ const float4 w0_f32 = (float4)1.0f - w1_f32;
+
+ // Calculate the weights for splitting vote
+ const float4 mag_w0_f32 = mag_f32 * w0_f32;
+ const float4 mag_w1_f32 = mag_f32 * w1_f32;
+
+ // Weighted vote between 2 bins
+
+ // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+ hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+ // Bin 0
+ bins[hidx_s32.s0] += mag_w0_f32.s0;
+ bins[hidx_s32.s1] += mag_w0_f32.s1;
+ bins[hidx_s32.s2] += mag_w0_f32.s2;
+ bins[hidx_s32.s3] += mag_w0_f32.s3;
+
+ hidx_s32 += (int4)1;
+
+ // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
+ hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
+
+            // Bin 1
+ bins[hidx_s32.s0] += mag_w1_f32.s0;
+ bins[hidx_s32.s1] += mag_w1_f32.s1;
+ bins[hidx_s32.s2] += mag_w1_f32.s2;
+ bins[hidx_s32.s3] += mag_w1_f32.s3;
+ }
+
+ // Left over computation
+ for(; xc < CELL_WIDTH; xc++)
+ {
+ const float mag_value = *((__global short *)mag_row_ptr + xc);
+            const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
+ const float w1 = phase_value - floor(phase_value);
+
+ // The quantised phase is the histogram index [0, NUM_BINS - 1]
+ // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
+ const uint hidx = (uint)(phase_value) % NUM_BINS;
+
+ // Weighted vote between 2 bins
+ bins[hidx] += mag_value * (1.0f - w1);
+ bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
+ }
+
+ // Point to the next row of magnitude and phase images
+ mag_row_ptr += mag_stride_y;
+ phase_row_ptr += phase_stride_y;
+ }
+
+ // Compute address for the destination image
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Store the local HOG in the global memory
+ int xc = 0;
+ for(; xc <= (NUM_BINS - 4); xc += 4)
+ {
+ float4 values = vload4(0, bins + xc);
+
+ vstore4(values, 0, ((__global float *)dst.ptr) + xc);
+ }
+
+ // Left over stores
+ for(; xc < NUM_BINS; ++xc)
+ {
+ ((__global float *)dst.ptr)[xc] = bins[xc];
+ }
+}
+#endif // (defined CELL_WIDTH && defined CELL_HEIGHT && defined NUM_BINS && defined PHASE_SCALE)
+
+#if(defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
+#ifndef L2_NORM
+#error The value of enum class HOGNormType::L2_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L2HYS_NORM
+#error The value of enum class HOGNormType::L2HYS_NORM has not been passed to the OpenCL kernel
+#endif
+
+#ifndef L1_NORM
+#error The value of enum class HOGNormType::L1_NORM has not been passed to the OpenCL kernel
+#endif
+
+/** This OpenCL kernel computes the HOG block normalization
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_CELLS_PER_BLOCK_HEIGHT = Number of cells for each block
+ * -# -DNUM_BINS_PER_BLOCK_X = Number of bins for each block along the X direction
+ * -# -DNUM_BINS_PER_BLOCK = Number of bins for each block
+ * -# -DHOG_NORM_TYPE = Normalization type
+ * -# -DL2_HYST_THRESHOLD = Threshold used for L2HYS_NORM normalization method
+ * -# -DL2_NORM = Value of the enum class HOGNormType::L2_NORM
+ * -# -DL2HYS_NORM = Value of the enum class HOGNormType::L2HYS_NORM
+ * -# -DL1_NORM = Value of the enum class HOGNormType::L1_NORM
+ *
+ * @note Each work-item computes a single block
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image which stores the normalized HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per block
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void hog_block_normalization(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ float sum = 0.0f;
+ float4 sum_f32 = (float4)(0.0f);
+
+ // Compute address for the source and destination tensor
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
+ {
+ const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
+
+ int xc = 0;
+ for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
+ {
+ const float4 val0 = vload4(0, hist_ptr + xc + 0);
+ const float4 val1 = vload4(0, hist_ptr + xc + 4);
+ const float4 val2 = vload4(0, hist_ptr + xc + 8);
+ const float4 val3 = vload4(0, hist_ptr + xc + 12);
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ // Compute val^2 for L2_NORM or L2HYS_NORM
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+#else
+ // Compute |val| for L1_NORM
+ sum_f32 += fabs(val0);
+ sum_f32 += fabs(val1);
+ sum_f32 += fabs(val2);
+ sum_f32 += fabs(val3);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+            // Store the un-normalized input values linearly in the output image; these values will be reused for the normalization.
+            // This keeps the next loop, where the normalization is done, cache friendly because all the values
+            // are accessed consecutively
+ vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
+ }
+
+ // Compute left over
+ for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
+ {
+ const float val = hist_ptr[xc];
+
+#if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+ sum += val * val;
+#else
+ sum += fabs(val);
+#endif // (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
+
+ ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
+ }
+ }
+
+ sum += dot(sum_f32, (float4)1.0f);
+
+ float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
+
+#if(HOG_NORM_TYPE == L2HYS_NORM)
+ // Reset sum
+ sum_f32 = (float4)0.0f;
+ sum = 0.0f;
+
+ int k = 0;
+ for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
+
+ // Scale val
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ // Clip val if over _threshold_l2hys
+ val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
+ val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
+ val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
+ val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
+
+ // Compute val^2
+ sum_f32 += val0 * val0;
+ sum_f32 += val1 * val1;
+ sum_f32 += val2 * val2;
+ sum_f32 += val3 * val3;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
+ }
+
+ // Compute left over
+ for(; k < NUM_BINS_PER_BLOCK; ++k)
+ {
+ float val = ((__global float *)dst.ptr)[k] * scale;
+
+ // Clip scaled input_value if over L2_HYST_THRESHOLD
+ val = fmin(val, (float)L2_HYST_THRESHOLD);
+
+ sum += val * val;
+
+ ((__global float *)dst.ptr)[k] = val;
+ }
+
+ sum += dot(sum_f32, (float4)1.0f);
+
+    // We use the same constant as OpenCV
+ scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+#endif // (HOG_NORM_TYPE == L2HYS_NORM)
+
+ int i = 0;
+ for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
+ {
+ float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
+ float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
+ float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
+ float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
+
+ // Multiply val by the normalization scale factor
+ val0 = val0 * (float4)scale;
+ val1 = val1 * (float4)scale;
+ val2 = val2 * (float4)scale;
+ val3 = val3 * (float4)scale;
+
+ vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
+ vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
+ vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
+ vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
+ }
+
+ for(; i < NUM_BINS_PER_BLOCK; ++i)
+ {
+ ((__global float *)dst.ptr)[i] *= scale;
+ }
+}
+#endif // (defined NUM_CELLS_PER_BLOCK_HEIGHT && defined NUM_BINS_PER_BLOCK_X && defined NUM_BINS_PER_BLOCK && defined HOG_NORM_TYPE && defined L2_HYST_THRESHOLD)
+
+#if(defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
+
+/** This OpenCL kernel computes the HOG detector using linear SVM
+ *
+ * @attention The following variables must be passed at compile time:
+ *
+ * -# -DNUM_BLOCKS_PER_DESCRIPTOR_Y = Number of blocks per descriptor along the Y direction
+ * -# -DNUM_BINS_PER_DESCRIPTOR_X = Number of bins per descriptor along the X direction
+ * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
+ * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectionWindow array
+ * -# -DIDX_CLASS = Index of the class to detect
+ * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction
+ * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction
+ * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
+ * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
+ *
+ * @note Each work-item computes a single detection window
+ *
+ * @param[in] src_ptr Pointer to the source image which stores the local HOG. Supported data types: F32. Number of channels supported: equal to the number of histogram bins per cell
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] hog_descriptor Pointer to HOG descriptor. Supported data types: F32
+ * @param[out] dst Pointer to DetectionWindow array
+ * @param[out] num_detection_windows Number of objects detected
+ */
+__kernel void hog_detector(IMAGE_DECLARATION(src),
+ __global float *hog_descriptor,
+ __global DetectionWindow *dst,
+ __global uint *num_detection_windows)
+{
+ // Check if the DetectionWindow array is full
+ if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
+ {
+ return;
+ }
+
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ const int src_step_y_f32 = src_stride_y / sizeof(float);
+
+ // Init score_f32 with 0
+ float4 score_f32 = (float4)0.0f;
+
+ // Init score with 0
+ float score = 0.0f;
+
+ __global float *src_row_ptr = (__global float *)src.ptr;
+
+ // Compute Linear SVM
+ for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
+ {
+ int xb = 0;
+
+ const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
+
+ for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
+ {
+ // Load descriptor values
+ float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
+ float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
+
+ float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
+ float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
+
+ // Multiply accumulate
+ score_f32 += a0_f32 * b0_f32;
+ score_f32 += a1_f32 * b1_f32;
+ }
+
+ for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
+ {
+ const float a = src_row_ptr[xb];
+ const float b = hog_descriptor[xb + offset_y];
+
+ score += a * b;
+ }
+ }
+
+ score += dot(score_f32, (float4)1.0f);
+
+ // Add the bias. The bias is located at the position (descriptor_size() - 1)
+ // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
+ score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
+
+ if(score > (float)THRESHOLD)
+ {
+ int id = atomic_inc(num_detection_windows);
+ if(id < MAX_NUM_DETECTION_WINDOWS)
+ {
+ dst[id].x = get_global_id(0) * BLOCK_STRIDE_WIDTH;
+ dst[id].y = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
+ dst[id].width = DETECTION_WINDOW_WIDTH;
+ dst[id].height = DETECTION_WINDOW_HEIGHT;
+ dst[id].idx_class = IDX_CLASS;
+ dst[id].score = score;
+ }
+ }
+}
+#endif // (defined NUM_BLOCKS_PER_DESCRIPTOR_Y && defined NUM_BINS_PER_DESCRIPTOR_X && defined THRESHOLD && defined MAX_NUM_DETECTION_WINDOWS && defined IDX_CLASS && defined BLOCK_STRIDE_WIDTH && defined BLOCK_STRIDE_HEIGHT && defined DETECTION_WINDOW_WIDTH && defined DETECTION_WINDOW_HEIGHT)
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
new file mode 100644
index 0000000000..970e04e150
--- /dev/null
+++ b/src/core/CL/cl_kernels/integral_image.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function computes the horizontal integral of the image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void integral_horizontal(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ uint prev = 0;
+
+ for(uint j = 0; j < src_step_x; j += 16)
+ {
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ uint16 res = convert_uint16(vload16(0, offset(&src, j, 0)));
+ res.s0 += prev;
+ res.s1 += res.s0;
+ res.s2 += res.s1;
+ res.s3 += res.s2;
+ res.s4 += res.s3;
+ res.s5 += res.s4;
+ res.s6 += res.s5;
+ res.s7 += res.s6;
+ res.s8 += res.s7;
+ res.s9 += res.s8;
+ res.sA += res.s9;
+ res.sB += res.sA;
+ res.sC += res.sB;
+ res.sD += res.sC;
+ res.sE += res.sD;
+ res.sF += res.sE;
+ prev = res.sF;
+ vstore16(res, 0, (__global uint *)offset(&dst, j, 0));
+ }
+}
+
+/** This function computes the vertical integral of the image.
+ *
+ * @param[in,out] src_ptr Pointer to the source image. Supported data types: U32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] height Image height.
+ */
+__kernel void integral_vertical(
+ IMAGE_DECLARATION(src),
+ uint height)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ uint8 prev = vload8(0, (__global uint *)offset(&src, 0, 0));
+ for(uint j = 1; j < height; ++j)
+ {
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ uint8 res = vload8(0, (__global uint *)offset(&src, 0, j));
+ res += prev;
+ vstore8(res, 0, (__global uint *)offset(&src, 0, j));
+ prev = res;
+ }
+}
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
new file mode 100644
index 0000000000..c4b0df8de9
--- /dev/null
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Calculates L1 normalization between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L1 normalization magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l1(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ return CONVERT_SAT(add_sat(abs(a), abs(b)), VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates L2 normalization between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return L2 normalization magnitude result. Supported data types: S16, S32
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 16) magnitude_l2(int16 a, int16 b)
+{
+ return CONVERT_SAT((sqrt(convert_float16((convert_uint16(a * a) + convert_uint16(b * b)))) + 0.5f),
+ VEC_DATA_TYPE(DATA_TYPE, 16));
+}
+
+/** Calculates unsigned phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Unsigned phase mapped in the interval [0, 180]. Supported data types: U8
+ */
+inline uchar16 phase_unsigned(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ float16 angle_deg_f32 = atan2pi(convert_float16(b), convert_float16(a)) * (float16)180.0f;
+ angle_deg_f32 = select(angle_deg_f32, (float16)180.0f + angle_deg_f32, angle_deg_f32 < (float16)0.0f);
+ return convert_uchar16(angle_deg_f32);
+}
+
+/** Calculates signed phase between two inputs.
+ *
+ * @param[in] a First input. Supported data types: S16, S32
+ * @param[in] b Second input. Supported data types: S16, S32
+ *
+ * @return Signed phase mapped in the interval [0, 256). Supported data types: U8
+ */
+inline uchar16 phase_signed(VEC_DATA_TYPE(DATA_TYPE, 16) a, VEC_DATA_TYPE(DATA_TYPE, 16) b)
+{
+ float16 arct = atan2pi(convert_float16(b), convert_float16(a));
+ arct = select(arct, arct + 2, arct < 0.0f);
+
+ return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & 0xFFu);
+}
+
+#if(1 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l1((x), (y))
+#elif(2 == MAGNITUDE)
+#define MAGNITUDE_OP(x, y) magnitude_l2(convert_int16(x), convert_int16(y))
+#else
+#define MAGNITUDE_OP(x, y)
+#endif
+
+#if(1 == PHASE)
+#define PHASE_OP(x, y) phase_unsigned((x), (y))
+#elif(2 == PHASE)
+#define PHASE_OP(x, y) phase_signed((x), (y))
+#else
+#define PHASE_OP(x, y)
+#endif
+
+/** Calculate the magnitude and phase of the gradients of an image.
+ *
+ * @note Magnitude calculation supported: L1 normalization(type = 1) and L2 normalization(type = 2).
+ * @note Phase calculation supported: Unsigned(type = 1) [0,180] and Signed(type = 2) [0,256).
+ *
+ * @attention To enable phase calculation -DPHASE="phase_calculation_type_id" must be provided at compile time, e.g. -DPHASE=1
+ * @attention To enable magnitude calculation -DMAGNITUDE="magnitude_calculation_type_id" must be provided at compile time, e.g. -DMAGNITUDE=1
+ * @attention The data type of the two inputs is passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=short. Supported data types are: short and int
+ *
+ * @param[in] gx_ptr Pointer to the first source image (gradient X). Supported data types: S16, S32
+ * @param[in] gx_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] gx_step_x gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gx_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] gx_step_y gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] gx_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] gy_ptr Pointer to the second source image (gradient Y). Supported data types: S16, S32
+ * @param[in] gy_stride_x Stride of the second source image in X dimension (in bytes)
+ * @param[in] gy_step_x gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gy_stride_y Stride of the second source image in Y dimension (in bytes)
+ * @param[in] gy_step_y gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] gy_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[out] magnitude_ptr Pointer to the magnitude destination image. Supported data types: S16, S32
+ * @param[in] magnitude_stride_x Stride of the magnitude destination image in X dimension (in bytes)
+ * @param[in] magnitude_step_x magnitude_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] magnitude_stride_y Stride of the magnitude destination image in Y dimension (in bytes)
+ * @param[in] magnitude_step_y magnitude_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] magnitude_offset_first_element_in_bytes The offset of the first element in the magnitude destination image
+ * @param[out] phase_ptr Pointer to the phase destination image. Supported data types: U8
+ * @param[in] phase_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] phase_step_x phase_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] phase_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] phase_step_y phase_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] phase_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void magnitude_phase(
+ IMAGE_DECLARATION(gx),
+ IMAGE_DECLARATION(gy)
+#ifdef MAGNITUDE
+ ,
+ IMAGE_DECLARATION(magnitude)
+#endif
+#ifdef PHASE
+ ,
+ IMAGE_DECLARATION(phase)
+#endif
+)
+{
+ // Get pixels pointer
+ Image gx = CONVERT_TO_IMAGE_STRUCT(gx);
+ Image gy = CONVERT_TO_IMAGE_STRUCT(gy);
+
+ // Load values
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in_a = vload16(0, (__global DATA_TYPE *)gx.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in_b = vload16(0, (__global DATA_TYPE *)gy.ptr);
+
+ // Calculate and store the results
+#ifdef MAGNITUDE
+ Image magnitude = CONVERT_TO_IMAGE_STRUCT(magnitude);
+ vstore16(MAGNITUDE_OP(in_a, in_b), 0, (__global DATA_TYPE *)magnitude.ptr);
+#endif
+#ifdef PHASE
+ Image phase = CONVERT_TO_IMAGE_STRUCT(phase);
+ vstore16(PHASE_OP(in_a, in_b), 0, phase.ptr);
+#endif
+}
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
new file mode 100644
index 0000000000..50b8312548
--- /dev/null
+++ b/src/core/CL/cl_kernels/mean_stddev.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+/** This function calculates the sum and sum of squares of a given input image.
+ *
+ * @note To enable the calculation of the sum of squares, -DSTDDEV should be passed as a preprocessor argument.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] height Height of the input image
+ * @param[out] global_sum Global sum of all elements
+ * @param[out] global_sum_sq Global sum of squares of all elements
+ */
+__kernel void mean_stddev_accumulate(
+ IMAGE_DECLARATION(src),
+ uint height,
+ __global ulong *global_sum
+#if defined STDDEV
+ ,
+ __global ulong *global_sum_sq
+#endif
+)
+{
+ // Get pixels pointer
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ uint8 tmp_sum = 0;
+#if defined STDDEV
+ uint8 tmp_sum_sq = 0;
+#endif
+ // Calculate partial sum
+ for(int i = 0; i < height; i++)
+ {
+ // Load data
+ uint8 data = convert_uint8(vload8(0, offset(&src, 0, i)));
+
+ tmp_sum += data;
+#if defined STDDEV
+ tmp_sum_sq += data * data;
+#endif
+ }
+ // Perform reduction
+ tmp_sum.s0123 += tmp_sum.s4567;
+ tmp_sum.s01 += tmp_sum.s23;
+ atom_add(global_sum, tmp_sum.s0 + tmp_sum.s1);
+
+#if defined STDDEV
+ tmp_sum_sq.s0123 += tmp_sum_sq.s4567;
+ tmp_sum_sq.s01 += tmp_sum_sq.s23;
+ atom_add(global_sum_sq, tmp_sum_sq.s0 + tmp_sum_sq.s1);
+#endif
+}
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : disable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : disable
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
new file mode 100644
index 0000000000..799b1e8c3b
--- /dev/null
+++ b/src/core/CL/cl_kernels/minmaxloc.cl
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+#ifndef DATA_TYPE_MIN
+#define DATA_TYPE_MIN 0x0
+#endif
+
+#ifndef DATA_TYPE_MAX
+#define DATA_TYPE_MAX 0xFF
+#endif
+
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MIN);
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_max = (VEC_DATA_TYPE(DATA_TYPE, 16))(DATA_TYPE_MAX);
+__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** This function identifies the minimum and maximum values of an input image.
+ *
+ * @note Input image data type must be passed as a preprocessor argument using -DDATA_TYPE.
+ * Moreover, the minimum and maximum value of the given data type must be provided using -DDATA_TYPE_MIN and -DDATA_TYPE_MAX respectively.
+ * @note If the image width is not a multiple of 16, -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[in] width Input image width
+ */
+__kernel void minmax(
+ IMAGE_DECLARATION(src),
+ __global int *min_max,
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Initialize local minimum and local maximum
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ local_min = type_max;
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ local_max = type_min;
+
+ // Calculate min/max of row
+ uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ local_min = min(data, local_min);
+ local_max = max(data, local_max);
+ }
+
+#ifdef NON_MULTIPLE_OF_16
+ // Handle non multiple of 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(DATA_TYPE, 16));
+ local_max = max(local_max, select(type_min, data, widx));
+ local_min = min(local_min, select(type_max, data, widx));
+#endif
+
+ // Perform min/max reduction
+ local_min.s01234567 = min(local_min.s01234567, local_min.s89ABCDEF);
+ local_max.s01234567 = max(local_max.s01234567, local_max.s89ABCDEF);
+
+ local_min.s0123 = min(local_min.s0123, local_min.s4567);
+ local_max.s0123 = max(local_max.s0123, local_max.s4567);
+
+ local_min.s01 = min(local_min.s01, local_min.s23);
+ local_max.s01 = max(local_max.s01, local_max.s23);
+
+ local_min.s0 = min(local_min.s0, local_min.s1);
+ local_max.s0 = max(local_max.s0, local_max.s1);
+
+ // Update global min/max
+ atomic_min(&min_max[0], local_min.s0);
+ atomic_max(&min_max[1], local_max.s0);
+}
+
+/** This function counts the occurrences of the minimum and maximum values in an image and records their positions.
+ *
+ * @note -DCOUNT_MIN_MAX should be specified if we want to count the occurrences of the minimum and maximum values.
+ * @note -DLOCATE_MIN and/or -DLOCATE_MAX should be specified if we want to store the position of each occurrence on the given array.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] min_max Pointer to buffer with minimum value in position 0 and maximum value in position 1
+ * @param[out] min_max_count Pointer to buffer with minimum value occurrences in position 0 and maximum value occurrences in position 1
+ * @param[out] min_loc Array that holds the location of the minimum value occurrences
+ * @param[in] max_min_loc_count The maximum number of minimum value occurrence coordinates the array can hold
+ * @param[out] max_loc Array that holds the location of the maximum value occurrences
+ * @param[in] max_max_loc_count The maximum number of maximum value occurrence coordinates the array can hold
+ */
+__kernel void minmaxloc(
+ IMAGE_DECLARATION(src),
+ __global int *min_max,
+ __global uint *min_max_count
+#if defined LOCATE_MIN
+ ,
+ __global Coordinates2D *min_loc, uint max_min_loc_count
+#endif
+#if defined LOCATE_MAX
+ ,
+ __global Coordinates2D *max_loc, uint max_max_loc_count
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ DATA_TYPE value = *((__global DATA_TYPE *)src.ptr);
+#if defined COUNT_MIN_MAX
+ if(value == min_max[0])
+ {
+ uint idx = atomic_inc(&min_max_count[0]);
+#if defined LOCATE_MIN
+ if(idx < max_min_loc_count)
+ {
+ min_loc[idx].x = get_global_id(0);
+ min_loc[idx].y = get_global_id(1);
+ }
+#endif
+ }
+ if(value == min_max[1])
+ {
+ uint idx = atomic_inc(&min_max_count[1]);
+#if defined LOCATE_MAX
+ if(idx < max_max_loc_count)
+ {
+ max_loc[idx].x = get_global_id(0);
+ max_loc[idx].y = get_global_id(1);
+ }
+#endif
+ }
+#endif
+}
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
new file mode 100644
index 0000000000..f860c96bb8
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "non_linear_filter_helpers.h"
+
+/** This function applies a non-linear filter on a 3x3 box basis to an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box3x3(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined MAX
+ uchar16 tmp = max(top, max(middle, bottom));
+ uchar8 out = row_reduce_max_3(tmp);
+#elif defined MEDIAN
+ uchar8 p0 = top.s01234567;
+ uchar8 p1 = top.s12345678;
+ uchar8 p2 = top.s23456789;
+ uchar8 p3 = middle.s01234567;
+ uchar8 p4 = middle.s12345678;
+ uchar8 p5 = middle.s23456789;
+ uchar8 p6 = bottom.s01234567;
+ uchar8 p7 = bottom.s12345678;
+ uchar8 p8 = bottom.s23456789;
+ uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non-linear filter on a 3x3 cross basis to an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross3x3(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar8 top = vload8(0, offset(&src, 0, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar8 bottom = vload8(0, offset(&src, 0, 1));
+
+ // Apply respective filter
+#if defined MIN
+ uchar8 tmp_middle = row_reduce_min_3(middle);
+ uchar8 out = min(tmp_middle, min(top, bottom));
+#elif defined MAX
+ uchar8 tmp_middle = row_reduce_max_3(middle);
+ uchar8 out = max(tmp_middle, max(top, bottom));
+#elif defined MEDIAN
+ uchar8 p0 = top.s01234567;
+ uchar8 p1 = middle.s01234567;
+ uchar8 p2 = middle.s12345678;
+ uchar8 p3 = middle.s23456789;
+ uchar8 p4 = bottom.s01234567;
+ uchar8 out = sort5(p0, p1, p2, p3, p4);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non-linear filter on a 3x3 disk basis to an input image.
+ *
+ * @note The needed filter operation is defined through the preprocessor by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk3x3(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top = vload16(0, offset(&src, -1, -1));
+ uchar16 middle = vload16(0, offset(&src, -1, 0));
+ uchar16 bottom = vload16(0, offset(&src, -1, 1));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp = min(top, min(middle, bottom));
+ uchar8 out = row_reduce_min_3(tmp);
+#elif defined MAX
+ uchar16 tmp = max(top, max(middle, bottom));
+ uchar8 out = row_reduce_max_3(tmp);
+#elif defined MEDIAN
+ uchar8 p0 = top.s01234567;
+ uchar8 p1 = top.s12345678;
+ uchar8 p2 = top.s23456789;
+ uchar8 p3 = middle.s01234567;
+ uchar8 p4 = middle.s12345678;
+ uchar8 p5 = middle.s23456789;
+ uchar8 p6 = bottom.s01234567;
+ uchar8 p7 = bottom.s12345678;
+ uchar8 p8 = bottom.s23456789;
+ uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
new file mode 100644
index 0000000000..d9ae95fd2d
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "non_linear_filter_helpers.h"
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+
+/** Sorting network to sort 8 disks of diameter 5 and return their medians.
+ *
+ * @param[in] top2 Values of elements two rows above.
+ * @param[in] top Values of elements one row above.
+ * @param[in] middle Values of middle elements.
+ * @param[in] bottom Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_disk5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top2.s12345678;
+ uchar8 p2 = top2.s23456789;
+ uchar8 p3 = top.s01234567;
+ uchar8 p4 = top.s12345678;
+ uchar8 p5 = top.s23456789;
+ uchar8 p6 = top.s3456789A;
+ uchar8 p7 = top.s456789AB;
+ uchar8 p8 = middle.s01234567;
+ uchar8 p9 = middle.s12345678;
+ uchar8 p10 = middle.s23456789;
+ uchar8 p11 = middle.s3456789A;
+ uchar8 p12 = middle.s456789AB;
+ uchar8 p13 = bottom.s01234567;
+ uchar8 p14 = bottom.s12345678;
+ uchar8 p15 = bottom.s23456789;
+ uchar8 p16 = bottom.s3456789A;
+ uchar8 p17 = bottom.s456789AB;
+ uchar8 p18 = bottom2.s01234567;
+ uchar8 p19 = bottom2.s12345678;
+ uchar8 p20 = bottom2.s23456789;
+
+ SORT(p0, p1);
+ SORT(p2, p3);
+ SORT(p4, p5);
+ SORT(p6, p7);
+ SORT(p8, p9);
+ SORT(p10, p11);
+ SORT(p12, p13);
+ SORT(p14, p15);
+ SORT(p16, p17);
+ SORT(p18, p19);
+ SORT(p0, p2);
+ SORT(p1, p3);
+ SORT(p4, p6);
+ SORT(p5, p7);
+ SORT(p8, p10);
+ SORT(p9, p11);
+ SORT(p12, p14);
+ SORT(p13, p15);
+ SORT(p16, p18);
+ SORT(p17, p19);
+ SORT(p1, p2);
+ SORT(p5, p6);
+ SORT(p0, p4);
+ SORT(p3, p7);
+ SORT(p9, p10);
+ SORT(p13, p14);
+ SORT(p8, p12);
+ SORT(p11, p15);
+ SORT(p17, p18);
+ SORT(p16, p20);
+ SORT(p1, p5);
+ SORT(p2, p6);
+ SORT(p9, p13);
+ SORT(p10, p14);
+ SORT(p0, p8);
+ SORT(p7, p15);
+ SORT(p17, p20);
+ SORT(p1, p4);
+ SORT(p3, p6);
+ SORT(p9, p12);
+ SORT(p11, p14);
+ SORT(p18, p20);
+ SORT(p0, p16);
+ SORT(p2, p4);
+ SORT(p3, p5);
+ SORT(p10, p12);
+ SORT(p11, p13);
+ SORT(p1, p9);
+ SORT(p6, p14);
+ SORT(p19, p20);
+ SORT(p3, p4);
+ SORT(p11, p12);
+ SORT(p1, p8);
+ SORT(p2, p10);
+ SORT(p5, p13);
+ SORT(p7, p14);
+ SORT(p3, p11);
+ SORT(p2, p8);
+ SORT(p4, p12);
+ SORT(p7, p13);
+ SORT(p1, p17);
+ SORT(p3, p10);
+ SORT(p5, p12);
+ SORT(p1, p16);
+ SORT(p2, p18);
+ SORT(p3, p9);
+ SORT(p6, p12);
+ SORT(p2, p16);
+ SORT(p3, p8);
+ SORT(p7, p12);
+ SORT(p5, p9);
+ SORT(p6, p10);
+ SORT(p4, p8);
+ SORT(p7, p11);
+ SORT(p3, p19);
+ SORT(p5, p8);
+ SORT(p7, p10);
+ SORT(p3, p18);
+ SORT(p4, p20);
+ SORT(p6, p8);
+ SORT(p7, p9);
+ SORT(p3, p17);
+ SORT(p5, p20);
+ SORT(p7, p8);
+ SORT(p3, p16);
+ SORT(p6, p20);
+ SORT(p5, p17);
+ SORT(p7, p20);
+ SORT(p4, p16);
+ SORT(p6, p18);
+ SORT(p5, p16);
+ SORT(p7, p19);
+ SORT(p7, p18);
+ SORT(p6, p16);
+ SORT(p7, p17);
+ SORT(p10, p18);
+ SORT(p7, p16);
+ SORT(p9, p17);
+ SORT(p8, p16);
+ SORT(p9, p16);
+ SORT(p10, p16);
+
+ return p10;
+}
+
+/** Sorting network to sort 8 boxes of size 5 and return their medians.
+ *
+ * @param[in] top2 Values of elements two rows above.
+ * @param[in] top Values of elements one row above.
+ * @param[in] middle Values of middle elements.
+ * @param[in] bottom Values of elements one row below.
+ * @param[in] bottom2 Values of elements two rows below.
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 median_box5x5(uchar16 top2, uchar16 top, uchar16 middle, uchar16 bottom, uchar16 bottom2)
+{
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top2.s12345678;
+ uchar8 p2 = top2.s23456789;
+ uchar8 p3 = top2.s3456789A;
+ uchar8 p4 = top2.s456789AB;
+ uchar8 p5 = top.s01234567;
+ uchar8 p6 = top.s12345678;
+ uchar8 p7 = top.s23456789;
+ uchar8 p8 = top.s3456789A;
+ uchar8 p9 = top.s456789AB;
+ uchar8 p10 = middle.s01234567;
+ uchar8 p11 = middle.s12345678;
+ uchar8 p12 = middle.s23456789;
+ uchar8 p13 = middle.s3456789A;
+ uchar8 p14 = middle.s456789AB;
+ uchar8 p15 = bottom.s01234567;
+ uchar8 p16 = bottom.s12345678;
+ uchar8 p17 = bottom.s23456789;
+ uchar8 p18 = bottom.s3456789A;
+ uchar8 p19 = bottom.s456789AB;
+ uchar8 p20 = bottom2.s01234567;
+ uchar8 p21 = bottom2.s12345678;
+ uchar8 p22 = bottom2.s23456789;
+ uchar8 p23 = bottom2.s3456789A;
+ uchar8 p24 = bottom2.s456789AB;
+
+ SORT(p1, p2);
+ SORT(p0, p1);
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p3, p4);
+ SORT(p4, p5);
+ SORT(p0, p3);
+ SORT(p2, p5);
+ SORT(p2, p3);
+ SORT(p1, p4);
+ SORT(p1, p2);
+ SORT(p3, p4);
+ SORT(p7, p8);
+ SORT(p6, p7);
+ SORT(p7, p8);
+ SORT(p10, p11);
+ SORT(p9, p10);
+ SORT(p10, p11);
+ SORT(p6, p9);
+ SORT(p8, p11);
+ SORT(p8, p9);
+ SORT(p7, p10);
+ SORT(p7, p8);
+ SORT(p9, p10);
+ SORT(p0, p6);
+ SORT(p4, p10);
+ SORT(p4, p6);
+ SORT(p2, p8);
+ SORT(p2, p4);
+ SORT(p6, p8);
+ SORT(p1, p7);
+ SORT(p5, p11);
+ SORT(p5, p7);
+ SORT(p3, p9);
+ SORT(p3, p5);
+ SORT(p7, p9);
+ SORT(p1, p2);
+ SORT(p3, p4);
+ SORT(p5, p6);
+ SORT(p7, p8);
+ SORT(p9, p10);
+ SORT(p13, p14);
+ SORT(p12, p13);
+ SORT(p13, p14);
+ SORT(p16, p17);
+ SORT(p15, p16);
+ SORT(p16, p17);
+ SORT(p12, p15);
+ SORT(p14, p17);
+ SORT(p14, p15);
+ SORT(p13, p16);
+ SORT(p13, p14);
+ SORT(p15, p16);
+ SORT(p19, p20);
+ SORT(p18, p19);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p21, p23);
+ SORT(p22, p24);
+ SORT(p22, p23);
+ SORT(p18, p21);
+ SORT(p20, p23);
+ SORT(p20, p21);
+ SORT(p19, p22);
+ SORT(p22, p24);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p12, p18);
+ SORT(p16, p22);
+ SORT(p16, p18);
+ SORT(p14, p20);
+ SORT(p20, p24);
+ SORT(p14, p16);
+ SORT(p18, p20);
+ SORT(p22, p24);
+ SORT(p13, p19);
+ SORT(p17, p23);
+ SORT(p17, p19);
+ SORT(p15, p21);
+ SORT(p15, p17);
+ SORT(p19, p21);
+ SORT(p13, p14);
+ SORT(p15, p16);
+ SORT(p17, p18);
+ SORT(p19, p20);
+ SORT(p21, p22);
+ SORT(p23, p24);
+ SORT(p0, p12);
+ SORT(p8, p20);
+ SORT(p8, p12);
+ SORT(p4, p16);
+ SORT(p16, p24);
+ SORT(p12, p16);
+ SORT(p2, p14);
+ SORT(p10, p22);
+ SORT(p10, p14);
+ SORT(p6, p18);
+ SORT(p6, p10);
+ SORT(p10, p12);
+ SORT(p1, p13);
+ SORT(p9, p21);
+ SORT(p9, p13);
+ SORT(p5, p17);
+ SORT(p13, p17);
+ SORT(p3, p15);
+ SORT(p11, p23);
+ SORT(p11, p15);
+ SORT(p7, p19);
+ SORT(p7, p11);
+ SORT(p11, p13);
+ SORT(p11, p12);
+ return p12;
+}
+
+/** This function applies a non-linear filter over a 5x5 box on an input image.
+ *
+ * @note The filter operation is selected at compile time by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_box5x5(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top2 = vload16(0, offset(&src, -2, -2));
+ uchar16 top = vload16(0, offset(&src, -2, -1));
+ uchar16 middle = vload16(0, offset(&src, -2, 0));
+ uchar16 bottom = vload16(0, offset(&src, -2, 1));
+ uchar16 bottom2 = vload16(0, offset(&src, -2, 2));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp = min(middle, min(min(top2, top), min(bottom, bottom2)));
+ uchar8 out = row_reduce_min_5(tmp);
+#elif defined MAX
+ uchar16 tmp = max(middle, max(max(top2, top), max(bottom, bottom2)));
+ uchar8 out = row_reduce_max_5(tmp);
+#elif defined MEDIAN
+ uchar8 out = median_box5x5(top2, top, middle, bottom, bottom2);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
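+
+// Note on the MIN/MAX paths above: min and max over a 5x5 box are separable,
+// so the kernel first reduces the five rows component-wise and then applies a
+// 5-wide sliding-window reduction along the row (row_reduce_min_5 /
+// row_reduce_max_5 from non_linear_filter_helpers.h).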
+
+/** This function applies a non-linear filter over a 5x5 cross on an input image.
+ *
+ * @note The filter operation is selected at compile time by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_cross5x5(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top2 = vload16(0, offset(&src, 0, -2));
+ uchar16 top = vload16(0, offset(&src, 0, -1));
+ uchar16 middle = vload16(0, offset(&src, -2, 0));
+ uchar16 bottom = vload16(0, offset(&src, 0, 1));
+ uchar16 bottom2 = vload16(0, offset(&src, 0, 2));
+
+ // Apply respective filter
+#if defined MIN
+ uchar8 tmp_middle = row_reduce_min_5(middle);
+ uchar8 out = min(tmp_middle, min(min(top2.s01234567, top.s01234567), min(bottom.s01234567, bottom2.s01234567)));
+#elif defined MAX
+ uchar8 tmp_middle = row_reduce_max_5(middle);
+ uchar8 out = max(tmp_middle, max(max(top2.s01234567, top.s01234567), max(bottom.s01234567, bottom2.s01234567)));
+#elif defined MEDIAN
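+    // A 5x5 cross covers 9 pixels: the full middle row plus the two pixels
+    // above and the two below the centre, so its median is the middle output
+    // of the 9-element sorting network below.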
+ uchar8 p0 = top2.s01234567;
+ uchar8 p1 = top.s01234567;
+ uchar8 p2 = middle.s01234567;
+ uchar8 p3 = middle.s12345678;
+ uchar8 p4 = middle.s23456789;
+ uchar8 p5 = middle.s3456789A;
+ uchar8 p6 = middle.s456789AB;
+ uchar8 p7 = bottom.s01234567;
+ uchar8 p8 = bottom2.s01234567;
+ uchar8 out = sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
+
+/** This function applies a non-linear filter over a 5x5 disk on an input image.
+ *
+ * @note The filter operation is selected at compile time by passing either -DMIN, -DMAX or -DMEDIAN.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_linear_filter_disk5x5(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Load values
+ uchar16 top2 = vload16(0, offset(&src, -1, -2));
+ uchar16 top = vload16(0, offset(&src, -2, -1));
+ uchar16 middle = vload16(0, offset(&src, -2, 0));
+ uchar16 bottom = vload16(0, offset(&src, -2, 1));
+ uchar16 bottom2 = vload16(0, offset(&src, -1, 2));
+
+ // Apply respective filter
+#if defined MIN
+ uchar16 tmp_3 = min(top2, bottom2);
+ uchar16 tmp_5 = min(middle, min(top, bottom));
+ uchar8 tmp_3_red = row_reduce_min_3(tmp_3);
+ uchar8 tmp_5_red = row_reduce_min_5(tmp_5);
+ uchar8 out = min(tmp_3_red, tmp_5_red);
+#elif defined MAX
+ uchar16 tmp_3 = max(top2, bottom2);
+ uchar16 tmp_5 = max(middle, max(top, bottom));
+ uchar8 tmp_3_red = row_reduce_max_3(tmp_3);
+ uchar8 tmp_5_red = row_reduce_max_5(tmp_5);
+ uchar8 out = max(tmp_3_red, tmp_5_red);
+#elif defined MEDIAN
+ uchar8 out = median_disk5x5(top2, top, middle, bottom, bottom2);
+#else
+#error "Unsupported filter function"
+#endif
+
+ // Store result
+ vstore8(out, 0, dst.ptr);
+}
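+
+// Note on the disk shape: a 5x5 disk keeps 21 of the 25 box pixels (rows of
+// width 3, 5, 5, 5, 3), which is why the MIN/MAX paths above combine a 3-wide
+// reduction of the outer rows with a 5-wide reduction of the inner rows.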
diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
new file mode 100644
index 0000000000..77da2091b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Sorts two vectors element-wise.
+ *
+ * @param[in, out] a First vector
+ * @param[in, out] b Second vector
+ */
+#define SORT(a, b) \
+ { \
+ uchar8 min_val = min(a, b); \
+ uchar8 max_val = max(a, b); \
+ a = min_val; \
+ b = max_val; \
+ }
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
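+//
+// Illustrative sketch (an addition, not used elsewhere in this patch): SORT
+// composes into arbitrary compare-exchange networks. For instance, a
+// 3-element network whose middle output is the per-lane median of three
+// vectors:
+inline uchar8 sort3_median(uchar8 p0, uchar8 p1, uchar8 p2)
+{
+    SORT(p0, p1);
+    SORT(p1, p2);
+    SORT(p0, p1); // after these exchanges, p0 <= p1 <= p2 holds per lane
+    return p1;
+}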
+
+/** Sorting network to sort 5 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
+{
+ SORT(p0, p1);
+ SORT(p2, p3);
+ SORT(p0, p2);
+ SORT(p1, p3);
+ SORT(p1, p2);
+ SORT(p0, p4);
+ SORT(p1, p4);
+ SORT(p2, p4);
+
+ return p2;
+}
+
+/** Sorting network to sort 9 vectors of 8 elements and return their median.
+ *
+ * @param[in] p0 First element vector
+ * @param[in] p1 Second element vector
+ * @param[in] p2 Third element vector
+ * @param[in] p3 Fourth element vector
+ * @param[in] p4 Fifth element vector
+ * @param[in] p5 Sixth element vector
+ * @param[in] p6 Seventh element vector
+ * @param[in] p7 Eighth element vector
+ * @param[in] p8 Ninth element vector
+ *
+ * @return Median values for 8 elements.
+ */
+inline uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
+{
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p7, p8);
+ SORT(p0, p1);
+ SORT(p3, p4);
+ SORT(p6, p7);
+ SORT(p1, p2);
+ SORT(p4, p5);
+ SORT(p7, p8);
+ SORT(p0, p3);
+ SORT(p5, p8);
+ SORT(p4, p7);
+ SORT(p3, p6);
+ SORT(p1, p4);
+ SORT(p2, p5);
+ SORT(p4, p7);
+ SORT(p4, p2);
+ SORT(p6, p4);
+ SORT(p4, p2);
+
+ return p4;
+}
+
+/** Calculate the minimum of a sliding window of size 3.
+ *
+ * @param[in] val Values over which to compute the sliding-window minimum
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_min_3(uchar16 val)
+{
+ return min(val.s01234567, min(val.s12345678, val.s23456789));
+}
+
+/** Calculate the maximum of a sliding window of size 3.
+ *
+ * @param[in] val Values over which to compute the sliding-window maximum
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 3.
+ */
+inline uchar8 row_reduce_max_3(uchar16 val)
+{
+ return max(val.s01234567, max(val.s12345678, val.s23456789));
+}
+
+/** Calculate the minimum of a sliding window of size 5.
+ *
+ * @param[in] val Values over which to compute the sliding-window minimum
+ *
+ * @return Minimum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_min_5(uchar16 val)
+{
+ return min(val.s01234567, min(min(val.s12345678, val.s23456789), min(val.s3456789A, val.s456789AB)));
+}
+
+/** Calculate the maximum of a sliding window of size 5.
+ *
+ * @param[in] val Values over which to compute the sliding-window maximum
+ *
+ * @return Maximum values of 8 elements on a sliding window of size 5.
+ */
+inline uchar8 row_reduce_max_5(uchar16 val)
+{
+ return max(val.s01234567, max(max(val.s12345678, val.s23456789), max(val.s3456789A, val.s456789AB)));
+}
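+
+// Example (illustrative): for val.s0..s2 = {9, 3, 7}, lane 0 of
+// row_reduce_min_3(val) is min(9, 3, 7) = 3; in general, output lane i covers
+// input lanes i..i+2 (i..i+4 for the 5-wide variants).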
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl
new file mode 100644
index 0000000000..0e388d7496
--- /dev/null
+++ b/src/core/CL/cl_kernels/nonmax.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs non-maxima suppression over a 3x3 window on a given image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_max_suppression(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ vc = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ if(all(vc == (DATA_TYPE)0))
+ {
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))0, 0, (__global DATA_TYPE *)dst.ptr);
+
+ return;
+ }
+
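+    // Tie-breaking below: a candidate must be >= its neighbours in the row
+    // above and to the left, but strictly > those to the right and in the row
+    // below, so two equal adjacent maxima cannot both survive.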
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));
+
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
+ out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));
+
+ nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
+ out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));
+
+ vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
new file mode 100644
index 0000000000..076b0d8909
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
+ * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] squared_input_step_x squared_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] squared_input_step_y squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] squared_input_step_z squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] coeff Alpha parameter / norm_size
+ * @param[in] beta Beta parameter in the normalization equation
+ * @param[in] kappa Kappa parameter in the normalization equation
+ * @param[in] radius Number of elements on the right or left side to normalize across
+ */
+__kernel void normalization_layer_cross_map(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(squared_input),
+ TENSOR3D_DECLARATION(output),
+ float coeff,
+ float beta,
+ float kappa,
+ uint radius)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ DATA_TYPE acc = 0;
+
+ const int num_of_slices = get_global_size(2);
+ const int current_slice = get_global_id(2);
+
+ const int left_slice = max(current_slice - (int)radius, (int)0);
+ const int right_slice = min(current_slice + (int)radius, (int)(num_of_slices - 1));
+
+ for(int i = left_slice; i <= right_slice; i++)
+ {
+ acc += *(__global DATA_TYPE *)tensor3D_offset(&squared_in, 0, 0, i - current_slice);
+ }
+
+ const float normalized = pow(kappa + coeff * (float)acc, beta);
+
+    const float normalized_pixel = (float)(*((__global DATA_TYPE *)in.ptr)) / normalized;
+
+ *(__global DATA_TYPE *)out.ptr = CONVERT(normalized_pixel, DATA_TYPE);
+}
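+
+// For reference, the kernel above evaluates, per element x_i:
+//
+//   out_i = x_i / (kappa + coeff * sum_{j = i-radius .. i+radius} x_j^2)^beta
+//
+// with the sum taken across neighbouring Z slices (feature maps), clamped to
+// the tensor boundary; the in-map variant below applies the same formula
+// along the X dimension.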
+
+/** Apply in-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] squared_input_ptr Pointer to the second source tensor. Supported data types: F16, F32
+ * @param[in] squared_input_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] squared_input_step_x squared_input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] squared_input_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] squared_input_step_y squared_input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] squared_input_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] squared_input_step_z squared_input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] squared_input_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] coeff Alpha parameter / norm_size
+ * @param[in] beta Beta parameter in the normalization equation
+ * @param[in] kappa Kappa parameter in the normalization equation
+ * @param[in] radius Number of elements on the right or left side to normalize across
+ */
+__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(squared_input),
+ TENSOR3D_DECLARATION(output),
+ float coeff,
+ float beta,
+ float kappa,
+ uint radius)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D squared_in = CONVERT_TO_TENSOR3D_STRUCT(squared_input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ acc_vec = 0;
+
+ const int current_pos = get_global_id(0) << 2;
+
+ const int left_pos = max(current_pos - (int)radius, -3);
+ const int right_pos = min(current_pos + (int)radius, (int)((get_global_size(0) << 2) + 3 - 1));
+
+ for(int i = left_pos; i <= right_pos; i += 1)
+ {
+ acc_vec += vload4(0, (__global DATA_TYPE *)tensor3D_offset(&squared_in, i - current_pos, 0, 0));
+ }
+
+    const float4 normalized = pow((float4)kappa + coeff * CONVERT(acc_vec, float4), beta);
+
+ const float4 normalized_pixel = CONVERT(vload4(0, (__global DATA_TYPE *)in.ptr), float4) / normalized;
+
+ vstore4(CONVERT(normalized_pixel, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
new file mode 100644
index 0000000000..e1131d5573
--- /dev/null
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "types.h"
+
+/*
+ * The criteria for lost tracking are that the spatial gradient matrix has:
+ * - a determinant less than DETERMINANT_THR
+ * - or a minimum eigenvalue smaller than EIGENVALUE_THR
+ *
+ * The thresholds for the determinant and the minimum eigenvalue are
+ * defined by the OpenVX specification.
+ *
+ * Note: Tracking is also lost when the tracked point's coordinates fall
+ * outside the image.
+ *
+ * https://www.khronos.org/registry/vx/specs/1.0/html/d0/d0c/group__group__vision__function__opticalflowpyrlk.html
+ */
+
+/* Internal Lucas-Kanade Keypoint struct */
+typedef struct InternalKeypoint
+{
+ float x; /**< The x coordinate. */
+ float y; /**< The y coordinate. */
+ float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+    float dummy;           /**< Unused padding; keeps the struct 16 bytes wide so the keypoint arrays can be loaded as float4. */
+} InternalKeypoint;
+
+/** Threshold for the determinant. Used for lost tracking criteria */
+#define DETERMINANT_THR 1.0e-07f
+
+/** Threshold for the minimum eigenvalue. Used for lost tracking criteria */
+#define EIGENVALUE_THR 1.0e-04f
+
+/** Constants used for Lucas-Kanade Algorithm */
+#define W_BITS (14)
+#define FLT_SCALE (1.0f / (float)(1 << 20))
+#define D0 ((float)(1 << W_BITS))
+#define D1 (1.0f / (float)(1 << (W_BITS - 5)))
+
+/** Initializes the internal old and new points arrays when the pyramid level is NOT the maximum one.
+ *
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level(
+ __global float4 *old_points_internal,
+ __global float4 *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ // Get old and new keypoints
+ float4 old_point = old_points_internal[idx];
+ float4 new_point = new_points_internal[idx];
+
+    // Scale up by the fixed level-to-level pyramid scale of 2 (the scale argument is unused here)
+ old_point.xy *= (float2)(2.0f);
+ new_point.xy *= (float2)(2.0f);
+
+ old_points_internal[idx] = old_point;
+ new_points_internal[idx] = new_point;
+}
+
+/** Initializes the internal new points array when the pyramid level is the maximum one.
+ *
+ * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max(
+ __global Keypoint *old_points,
+ __global InternalKeypoint *old_points_internal,
+ __global InternalKeypoint *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ Keypoint old_point = old_points[idx];
+
+ // Get old keypoint to track
+ InternalKeypoint old_point_internal;
+ old_point_internal.x = old_point.x * scale;
+ old_point_internal.y = old_point.y * scale;
+ old_point_internal.tracking_status = 1.f;
+
+ // Store internal keypoints
+ old_points_internal[idx] = old_point_internal;
+ new_points_internal[idx] = old_point_internal;
+}
+
+/** Initializes the new_points array when the pyramid level is the maximum one and use_initial_estimate = 1.
+ *
+ * @param[in] old_points An array of key points that are defined at the old_images high resolution pyramid.
+ * @param[in] new_points_estimates An array of estimate key points that are defined at the old_images high resolution pyramid.
+ * @param[in,out] old_points_internal An array of internal key points that are defined at the old_images high resolution pyramid.
+ * @param[out] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[in] scale Scale factor to apply for the new_point coordinates.
+ */
+__kernel void init_level_max_initial_estimate(
+ __global Keypoint *old_points,
+ __global Keypoint *new_points_estimates,
+ __global InternalKeypoint *old_points_internal,
+ __global InternalKeypoint *new_points_internal,
+ const float scale)
+{
+ int idx = get_global_id(0);
+
+ Keypoint old_point = old_points[idx];
+ Keypoint new_point_estimate = new_points_estimates[idx];
+ InternalKeypoint old_point_internal;
+ InternalKeypoint new_point_internal;
+
+ // Get old keypoint to track
+ old_point_internal.x = old_point.x * scale;
+ old_point_internal.y = old_point.y * scale;
+ old_point_internal.tracking_status = 1.f;
+
+ // Get new keypoint to track
+ new_point_internal.x = new_point_estimate.x * scale;
+ new_point_internal.y = new_point_estimate.y * scale;
+ new_point_internal.tracking_status = new_point_estimate.tracking_status;
+
+ // Store internal keypoints
+ old_points_internal[idx] = old_point_internal;
+ new_points_internal[idx] = new_point_internal;
+}
+
+/** Rounds the coordinates stored in the new_points_internal array and stores them into new_points.
+ *
+ * @param[in] new_points_internal An array of internal key points that are defined at the new_images high resolution pyramid.
+ * @param[out] new_points An output array of key points that are defined at the new_images high resolution pyramid.
+ */
+__kernel void finalize(
+ __global InternalKeypoint *new_points_internal,
+ __global Keypoint *new_points)
+{
+ int idx = get_global_id(0);
+
+ // Load internal keypoint
+ InternalKeypoint new_point_internal = new_points_internal[idx];
+
+ // Calculate output point
+ Keypoint new_point;
+ new_point.x = round(new_point_internal.x);
+ new_point.y = round(new_point_internal.y);
+ new_point.tracking_status = new_point_internal.tracking_status;
+
+ // Store new point
+ new_points[idx] = new_point;
+}
+
+/** Computes A11, A12, A22, min_eig, ival, ixval and iyval for the current pyramid level. These values are then used in stage 1.
+ *
+ * @param[in] old_image_ptr Pointer to the input old image. Supported data types: U8
+ * @param[in] old_image_stride_x Stride of the input old image in X dimension (in bytes)
+ * @param[in] old_image_step_x old_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_image_stride_y Stride of the input old image in Y dimension (in bytes)
+ * @param[in] old_image_step_y old_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_image_offset_first_element_in_bytes The offset of the first element in the input old image
+ * @param[in] old_scharr_gx_ptr Pointer to the input scharr x image. Supported data types: S16
+ * @param[in] old_scharr_gx_stride_x Stride of the input scharr x image in X dimension (in bytes)
+ * @param[in] old_scharr_gx_step_x old_scharr_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_scharr_gx_stride_y Stride of the input scharr x image in Y dimension (in bytes)
+ * @param[in] old_scharr_gx_step_y old_scharr_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_scharr_gx_offset_first_element_in_bytes The offset of the first element in the input scharr x image
+ * @param[in] old_scharr_gy_ptr Pointer to the input scharr y image. Supported data types: S16
+ * @param[in] old_scharr_gy_stride_x Stride of the input scharr y image in X dimension (in bytes)
+ * @param[in] old_scharr_gy_step_x old_scharr_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_stride_y Stride of the input scharr y image in Y dimension (in bytes)
+ * @param[in] old_scharr_gy_step_y old_scharr_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] old_scharr_gy_offset_first_element_in_bytes The offset of the first element in the input scharr y image
+ * @param[in] old_points An array of key points. Those key points are defined at the old_images high resolution pyramid
+ * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[out] coeff It stores | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[out] iold_val It stores | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
+ * @param[in] half_window The half size of the window on which to perform the algorithm
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1)
+ * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in] level0 It is set to 1 if the current level is level 0 of the pyramid
+ */
+void __kernel lktracker_stage0(
+ IMAGE_DECLARATION(old_image),
+ IMAGE_DECLARATION(old_scharr_gx),
+ IMAGE_DECLARATION(old_scharr_gy),
+ __global float4 *old_points,
+ __global float4 *new_points,
+ __global float4 *coeff,
+ __global short4 *iold_val,
+ const int window_dimension,
+ const int window_dimension_pow2,
+ const int half_window,
+ const float3 border_limits,
+ const float eig_const,
+ const int level0)
+{
+ int idx = get_global_id(0);
+
+ Image old_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_image);
+ Image old_scharr_gx = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gx);
+ Image old_scharr_gy = CONVERT_TO_IMAGE_STRUCT_NO_STEP(old_scharr_gy);
+
+ // Get old keypoint
+ float2 old_keypoint = old_points[idx].xy - (float2)half_window;
+
+ // Get the floor value
+ float2 iold_keypoint = floor(old_keypoint);
+
+ // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+ if(any(iold_keypoint < border_limits.zz) || any(iold_keypoint >= border_limits.xy))
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0.0f;
+ }
+
+ // Not valid coordinate. It sets min_eig to 0.0f
+ coeff[idx].s3 = 0.0f;
+
+ return;
+ }
+
+ // Compute weight for the bilinear interpolation
+ float2 ab = old_keypoint - iold_keypoint;
+
+ // Weight used for Bilinear-Interpolation on Scharr images
+ // w_scharr.s0 = (1.0f - ab.x) * (1.0f - ab.y)
+ // w_scharr.s1 = ab.x * (1.0f - ab.y)
+ // w_scharr.s2 = (1.0f - ab.x) * ab.y
+ // w_scharr.s3 = ab.x * ab.y
+
+ float4 w_scharr;
+ w_scharr.s3 = ab.x * ab.y;
+ w_scharr.s0 = w_scharr.s3 + 1.0f - ab.x - ab.y;
+ w_scharr.s12 = ab - (float2)w_scharr.s3;
+
+ // Weight used for Bilinear-Interpolation on Old and New images
+ // w.s0 = round(w_scharr.s0 * D0)
+ // w.s1 = round(w_scharr.s1 * D0)
+ // w.s2 = round(w_scharr.s2 * D0)
+    // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+ float4 w;
+ w = round(w_scharr * (float4)D0);
+ w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
+
+ // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+ int4 iG = (int4)0;
+
+ // Window offset
+ int window_offset = idx * window_dimension_pow2;
+
+ // Compute Spatial Gradient Matrix G
+ for(ushort ky = 0; ky < window_dimension; ++ky)
+ {
+ int offset_y = iold_keypoint.y + ky;
+ for(ushort kx = 0; kx < window_dimension; ++kx)
+ {
+ int offset_x = iold_keypoint.x + kx;
+ float4 px;
+
+ // Load values from old_image for computing the bilinear interpolation
+ px = convert_float4((uchar4)(vload2(0, offset(&old_image, offset_x, offset_y)),
+ vload2(0, offset(&old_image, offset_x, offset_y + 1))));
+
+ // old_i.s0 = ival, old_i.s1 = ixval, old_i.s2 = iyval, old_i.s3 = dummy
+ float4 old_i;
+
+ // Compute bilinear interpolation (with D1 scale factor) for ival
+ old_i.s0 = dot(px, w) * D1;
+
+ // Load values from old_scharr_gx for computing the bilinear interpolation
+ px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y)),
+ vload2(0, (__global short *)offset(&old_scharr_gx, offset_x, offset_y + 1))));
+
+ // Compute bilinear interpolation for ixval
+ old_i.s1 = dot(px, w_scharr);
+
+ // Load values from old_scharr_gy for computing the bilinear interpolation
+ px = convert_float4((short4)(vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y)),
+ vload2(0, (__global short *)offset(&old_scharr_gy, offset_x, offset_y + 1))));
+
+ // Compute bilinear interpolation for iyval
+ old_i.s2 = dot(px, w_scharr);
+
+ // Rounding (it could be omitted. Used just for matching the VX implementation)
+ int4 iold = convert_int4(round(old_i));
+
+ // Accumulate values in the Spatial Gradient Matrix
+ iG.s0 += (int)(iold.s1 * iold.s1);
+ iG.s1 += (int)(iold.s1 * iold.s2);
+ iG.s2 += (int)(iold.s2 * iold.s2);
+
+ // Store ival, ixval and iyval
+ iold_val[window_offset + kx] = convert_short4(iold);
+ }
+ window_offset += window_dimension;
+ }
+
+ // Scale iA11, iA12 and iA22
+ float4 G = convert_float4(iG) * (float4)FLT_SCALE;
+
+ // Compute minimum eigen value
+ G.s3 = (float)(G.s2 + G.s0 - sqrt(pown(G.s0 - G.s2, 2) + 4.0f * G.s1 * G.s1)) * eig_const;
+
+    // Store A11, A12, A22 and min_eig
+ coeff[idx] = G;
+}
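+
+// For reference, the spatial gradient matrix accumulated above is
+//
+//   G = | sum(Ix*Ix)  sum(Ix*Iy) | = | A11  A12 |
+//       | sum(Ix*Iy)  sum(Iy*Iy) |   | A12  A22 |
+//
+// and min_eig = (A11 + A22 - sqrt((A11 - A22)^2 + 4*A12^2)) * eig_const,
+// i.e. the smaller eigenvalue of G normalised by the window area.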
+
+/** Computes the motion vector for a given keypoint
+ *
+ * @param[in] new_image_ptr Pointer to the input new image. Supported data types: U8
+ * @param[in] new_image_stride_x Stride of the input new image in X dimension (in bytes)
+ * @param[in] new_image_step_x new_image_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] new_image_stride_y Stride of the input new image in Y dimension (in bytes)
+ * @param[in] new_image_step_y new_image_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] new_image_offset_first_element_in_bytes The offset of the first element in the input new image
+ * @param[in, out] new_points An output array of key points. Those key points are defined at the new_images high resolution pyramid
+ * @param[in] coeff The | A11 | A12 | A22 | min_eig | for each keypoint
+ * @param[in] iold_val The | ival | ixval | iyval | dummy | for each point in the window centered on old_keypoint
+ * @param[in] window_dimension The size of the window on which to perform the algorithm
+ * @param[in] window_dimension_pow2 The squared size of the window on which to perform the algorithm
+ * @param[in] half_window The half size of the window on which to perform the algorithm
+ * @param[in] num_iterations The maximum number of iterations
+ * @param[in] epsilon The value for terminating the algorithm.
+ * @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1)
+ * @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
+ * @param[in] level0 It is set to 1 if the current level is level 0 of the pyramid
+ * @param[in] term_iteration It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS
+ * @param[in] term_epsilon It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON
+ */
+void __kernel lktracker_stage1(
+ IMAGE_DECLARATION(new_image),
+ __global float4 *new_points,
+ __global float4 *coeff,
+ __global short4 *iold_val,
+ const int window_dimension,
+ const int window_dimension_pow2,
+ const int half_window,
+ const int num_iterations,
+ const float epsilon,
+ const float3 border_limits,
+ const float eig_const,
+ const int level0,
+ const int term_iteration,
+ const int term_epsilon)
+{
+ int idx = get_global_id(0);
+ Image new_image = CONVERT_TO_IMAGE_STRUCT_NO_STEP(new_image);
+
+ // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
+ float4 G = coeff[idx];
+
+ // Determinant
+ float D = G.s0 * G.s2 - G.s1 * G.s1;
+
+ // Check if it is a good point to track
+ if(G.s3 < EIGENVALUE_THR || D < DETERMINANT_THR)
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0;
+ }
+
+ return;
+ }
+
+ // Compute inverse
+ //D = native_recip(D);
+    D = 1.0f / D;
+
+ // Get new keypoint
+ float2 new_keypoint = new_points[idx].xy - (float)half_window;
+
+ // Get new point
+ float2 out_new_point = new_points[idx].xy;
+
+ // Keep delta obtained in the previous iteration
+ float2 prev_delta = (float2)0.0f;
+
+ int j = 0;
+ while(j < num_iterations)
+ {
+ // Get the floor value
+ float2 inew_keypoint = floor(new_keypoint);
+
+ // Check if using the window dimension we can go out of boundary in the following for loops. If so, invalidate the tracked point
+ if(any(inew_keypoint < border_limits.zz) || any(inew_keypoint >= border_limits.xy))
+ {
+ if(level0 == 1)
+ {
+ // Invalidate tracked point as we are at level 0
+ new_points[idx].s2 = 0.0f;
+ }
+ else
+ {
+ new_points[idx].xy = out_new_point;
+ }
+
+ return;
+ }
+
+ // Compute weight for the bilinear interpolation
+ float2 ab = new_keypoint - inew_keypoint;
+
+ // Weight used for Bilinear-Interpolation on Old and New images
+ // w.s0 = round((1.0f - ab.x) * (1.0f - ab.y) * D0)
+ // w.s1 = round(ab.x * (1.0f - ab.y) * D0)
+ // w.s2 = round((1.0f - ab.x) * ab.y * D0)
+ // w.s3 = D0 - w.s0 - w.s1 - w.s2
+
+ float4 w;
+ w.s3 = ab.x * ab.y;
+ w.s0 = w.s3 + 1.0f - ab.x - ab.y;
+ w.s12 = ab - (float2)w.s3;
+ w = round(w * (float4)D0);
+ w.s3 = D0 - w.s0 - w.s1 - w.s2;
+
+ // Mismatch vector
+ int2 ib = 0;
+
+ // Old val offset
+ int old_val_offset = idx * window_dimension_pow2;
+
+ for(int ky = 0; ky < window_dimension; ++ky)
+ {
+ for(int kx = 0; kx < window_dimension; ++kx)
+ {
+ // ival, ixval and iyval have been computed in the previous stage
+ int4 old_ival = convert_int4(iold_val[old_val_offset]);
+
+                // Load values from new_image for computing the bilinear interpolation
+ float4 px = convert_float4((uchar4)(vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky)),
+ vload2(0, offset(&new_image, inew_keypoint.x + kx, inew_keypoint.y + ky + 1))));
+
+ // Compute bilinear interpolation on new image
+ int jval = (int)round(dot(px, w) * D1);
+
+ // Compute luminance difference
+ int diff = (int)(jval - old_ival.s0);
+
+ // Accumulate values in mismatch vector
+ ib += (diff * old_ival.s12);
+
+ // Update old val offset
+ old_val_offset++;
+ }
+ }
+
+ float2 b = convert_float2(ib) * (float2)FLT_SCALE;
+
+ // Optical Flow
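+        // delta = -G^(-1) * b: the closed-form solve of the 2x2 Lucas-Kanade
+        // system, using the inverse determinant D computed before the loop.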
+ float2 delta;
+
+ delta.x = (float)((G.s1 * b.y - G.s2 * b.x) * D);
+ delta.y = (float)((G.s1 * b.x - G.s0 * b.y) * D);
+
+ // Update new point coordinate
+ new_keypoint += delta;
+
+ out_new_point = new_keypoint + (float2)half_window;
+
+ if(term_epsilon == 1)
+ {
+ float mag2 = dot(delta, delta);
+
+ if(mag2 <= epsilon)
+ {
+ new_points[idx].xy = out_new_point;
+
+ return;
+ }
+ }
+
+ // Check convergence analyzing the previous delta
+ if(j > 0 && all(fabs(delta + prev_delta) < (float2)0.01f))
+ {
+ out_new_point -= delta * (float2)0.5f;
+
+ new_points[idx].xy = out_new_point;
+
+ return;
+ }
+
+ // Update previous delta
+ prev_delta = delta;
+
+ if(term_iteration == 1)
+ {
+ j++;
+ }
+ }
+
+ new_points[idx].xy = out_new_point;
+}
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
new file mode 100644
index 0000000000..ae2031f422
--- /dev/null
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
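+
+// For example, assuming ROUND is defined at build time as _rte, with
+// -DSATURATE defined CONVERT_OP_FLOAT(x, short16, ROUND) expands to
+// convert_short16_sat_rte(x); without -DSATURATE, to convert_short16_rte(x).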
+
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed, otherwise -DDATA_TYPE_RES=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_float(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out),
+ const float scale)
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform multiplication
+#if defined DATA_TYPE_FLOAT
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT(in1_data * in2_data * scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+#else
+ VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
+ res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data * in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND);
+#endif
+
+ // Store result
+ vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
new file mode 100644
index 0000000000..05c437cd17
--- /dev/null
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_INT_STR(x, type) (convert_##type##_sat(x))
+#else
+#define CONVERT_OP_INT_STR(x, type) (convert_##type(x))
+#endif
+#define CONVERT_OP_INT(x, type) CONVERT_OP_INT_STR(x, type)
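+
+// For example, with -DSATURATE defined, CONVERT_OP_INT(x, ushort16) expands
+// to convert_ushort16_sat(x); without it, to convert_ushort16(x).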
+
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DDATA_TYPE_RES.
+ * e.g. If one of the inputs is S16, -DDATA_TYPE_RES=int should be passed, otherwise -DDATA_TYPE_RES=short.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] scale Integer scaling factor. Supported data types: S32
+ */
+__kernel void pixelwise_mul_int(
+ IMAGE_DECLARATION(in1),
+ IMAGE_DECLARATION(in2),
+ IMAGE_DECLARATION(out),
+ const uint scale)
+{
+ // Get pixels pointer
+ Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
+ Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+ VEC_DATA_TYPE(DATA_TYPE_RES, 16)
+ in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16));
+
+ // Perform multiplication and store result
+ vstore16(CONVERT_OP_INT(((in1_data * in2_data) >> scale), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
+}
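+
+// Note (assumption about the host side): since the product is scaled with a
+// right shift, a scale of 1/2^n is expected to be passed as n; e.g. scale = 1
+// halves the product before conversion to the output type.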
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
new file mode 100644
index 0000000000..1902df9b7d
--- /dev/null
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined POOL_AVG
+#define POOL_OP(x, y) ((x) + (y))
+#else
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif
+
+float calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ int end_x = min(start_x + pool_size, upper_bound_w);
+ int end_y = min(start_y + pool_size, upper_bound_h);
+ return 1.f / ((end_y - start_y) * (end_x - start_x));
+}
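+
+/* Worked example (illustrative): for pool_size = 2, stride = 2, pad = 0 and the
+ * work-item at global id (1, 1), start = (2, 2) and end = (4, 4), giving a scale
+ * of 1 / ((4 - 2) * (4 - 2)) = 1/4. Near the right/bottom borders the min()
+ * against upper_bound_w/h shrinks the window, so fewer samples are averaged. */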
+
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Pooling stride must be passed using -DPOOL_STRIDE, e.g. -DPOOL_STRIDE=2. Supported strides are 1, 2 and 3.
+ * @note Datatype must be passed using -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types are F16 and F32.
+ * @note In case of average pooling -DPOOL_AVG must be provided, otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
+ * @param[in] strides The pooling operation strides in each dimension
+ * @param[in] paddings The pooling operation paddings in each dimension
+ */
+__kernel void pooling_layer_2(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
+#ifdef POOL_AVG
+ ,
+ int2 max_dims, int2 strides, int2 paddings
+#endif
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+ // Perform calculations
+ data0 = POOL_OP(data0, data1);
+ DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+
+    // Divide by the (border-clamped) pooling region size in case of average pooling
+#ifdef POOL_AVG
+ res *= calculate_avg_scale(2, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
+#endif
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
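+
+/* Illustrative host-side sketch (hypothetical names, not library API): the
+ * kernel above is specialised at compile time, e.g. 2x2 average pooling on
+ * F32 could be selected with build options such as
+ *
+ *   const char *build_opts = "-DDATA_TYPE=float -DPOOL_AVG";
+ *   clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
+ *
+ * Omitting -DPOOL_AVG makes POOL_OP fall back to fmax(), i.e. max pooling. */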
+
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Pooling stride must be passed using -DPOOL_STRIDE, e.g. -DPOOL_STRIDE=2. Supported strides are 1, 2 and 3.
+ * @note Datatype must be passed using -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types are F16 and F32.
+ * @note In case of average pooling -DPOOL_AVG must be provided, otherwise max pooling will be performed.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: F16, F32
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] max_dims The maximum index that can be accessed in x and y dimension (width + pad)
+ * @param[in] strides The pooling operation strides in each dimension
+ * @param[in] paddings The pooling operation paddings in each dimension
+ */
+__kernel void pooling_layer_3(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output)
+#ifdef POOL_AVG
+ ,
+ int2 max_dims, int2 strides, int2 paddings
+#endif
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+ // Perform calculations
+ data0 = POOL_OP(data0, data1);
+ data0 = POOL_OP(data0, data2);
+ DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+
+    // Divide by the (border-clamped) pooling region size in case of average pooling
+#ifdef POOL_AVG
+ res *= calculate_avg_scale(3, max_dims.x, max_dims.y, paddings.x, paddings.y, strides.x, strides.y);
+#endif
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = res;
+}
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/remap.cl
new file mode 100644
index 0000000000..e0f3bf3468
--- /dev/null
+++ b/src/core/CL/cl_kernels/remap.cl
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Performs a remapping of an input image to an output image, given two remapping images, using nearest-neighbour interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_y                         mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                            Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+ map_coords += (float8)(0.5f);
+
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
+}
+
+/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_y                         mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                            Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+ vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
+}
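+
+/* Illustrative scalar reference (not part of the library) of the pixel
+ * coordinate translation above, ignoring vectorisation and border handling.
+ * This is the nearest-neighbour variant; the bilinear kernel instead blends
+ * the four pixels surrounding the mapped coordinates. */
+inline uchar remap_nearest_scalar_ref(__global const uchar *in, int in_stride_y,
+                                      float map_x, float map_y)
+{
+    const int x = convert_int(map_x + 0.5f); // +0.5f then truncation = rounding, as above
+    const int y = convert_int(map_y + 0.5f);
+    return in[y * in_stride_y + x];          // out(x, y) = in(mapx(x, y), mapy(x, y))
+}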
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
new file mode 100644
index 0000000000..9ef33b83ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
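+
+/* Worked example (illustrative): downscaling a row of 4 pixels to 2 gives
+ * scale.s0 = 2. For output x = 0, transform_nearest samples input
+ * x = (0 + 0.5) * 2 = 1.0, while transform_bilinear samples
+ * (0 + 0.5) * 2 - 0.5 = 0.5, i.e. the midpoint of input pixels 0 and 1,
+ * which the bilinear kernel then blends. */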
+
+/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] output_width Output image width
+ * @param[in] output_height Output image height
+ */
+__kernel void scale_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float output_width,
+ const float output_height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ const float8 tc = clamp_to_border(transform_nearest(get_current_coords(), r), input_width, input_height);
+ vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] output_width Output image width
+ * @param[in] output_height Output image height
+ */
+__kernel void scale_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float output_width,
+ const float output_height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(input_width / output_width, input_height / output_height);
+ const float8 tc = clamp_to_border(transform_bilinear(get_current_coords(), r), input_width, input_height);
+ vstore4(bilinear_interpolate(&in, tc, input_width, input_height), 0, (__global DATA_TYPE *)out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
new file mode 100644
index 0000000000..ef9878c1a3
--- /dev/null
+++ b/src/core/CL/cl_kernels/scharr_filter.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This OpenCL kernel computes Scharr3x3.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void scharr3x3(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+#ifdef GRAD_X
+ short8 gx = (short8)0;
+#endif
+#ifdef GRAD_Y
+ short8 gy = (short8)0;
+#endif
+
+ // Row0
+ uchar16 temp = vload16(0, offset(&src, -1, -1));
+ short8 left = convert_short8(temp.s01234567);
+ short8 middle = convert_short8(temp.s12345678);
+ short8 right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-3);
+ gx += right * (short8)(+3);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(-3);
+ gy += middle * (short8)(-10);
+ gy += right * (short8)(-3);
+#endif
+
+ // Row1
+ temp = vload16(0, offset(&src, -1, 0));
+ left = convert_short8(temp.s01234567);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-10);
+ gx += right * (short8)(+10);
+#endif
+
+ // Row2
+ temp = vload16(0, offset(&src, -1, 1));
+ left = convert_short8(temp.s01234567);
+ middle = convert_short8(temp.s12345678);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-3);
+ gx += right * (short8)(+3);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(+3);
+ gy += middle * (short8)(+10);
+ gy += right * (short8)(+3);
+#endif
+
+ // Store results
+#ifdef GRAD_X
+ vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
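+
+/* For reference, the two 3x3 Scharr kernels unrolled above (rows match
+ * Row0..Row2 in the code):
+ *
+ *        Gx                  Gy
+ *   -3   0   +3        -3  -10   -3
+ *  -10   0  +10         0    0    0
+ *   -3   0   +3        +3  +10   +3
+ */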
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
new file mode 100644
index 0000000000..4eb0eef770
--- /dev/null
+++ b/src/core/CL/cl_kernels/sobel_filter.cl
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/***********************************************/
+/* Begin implementation of Sobel3x3 filter */
+/***********************************************/
+
+/** This OpenCL kernel computes a Sobel3x3 filter.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel3x3(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+#ifdef GRAD_X
+ short8 gx = (short8)0;
+#endif
+#ifdef GRAD_Y
+ short8 gy = (short8)0;
+#endif
+
+ // Row0
+ uchar16 temp = vload16(0, offset(&src, -1, -1));
+ short8 left = convert_short8(temp.s01234567);
+ short8 middle = convert_short8(temp.s12345678);
+ short8 right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-1);
+ gx += right * (short8)(+1);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(-1);
+ gy += middle * (short8)(-2);
+ gy += right * (short8)(-1);
+#endif
+
+ // Row1
+ temp = vload16(0, offset(&src, -1, 0));
+ left = convert_short8(temp.s01234567);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-2);
+ gx += right * (short8)(+2);
+#endif
+
+ // Row2
+ temp = vload16(0, offset(&src, -1, 1));
+ left = convert_short8(temp.s01234567);
+ middle = convert_short8(temp.s12345678);
+ right = convert_short8(temp.s23456789);
+#ifdef GRAD_X
+ gx += left * (short8)(-1);
+ gx += right * (short8)(+1);
+#endif
+#ifdef GRAD_Y
+ gy += left * (short8)(+1);
+ gy += middle * (short8)(+2);
+ gy += right * (short8)(+1);
+#endif
+
+ // Store results
+#ifdef GRAD_X
+ vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
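+
+/* For reference, the two 3x3 Sobel kernels unrolled above:
+ *
+ *       Gx                Gy
+ *  -1   0  +1        -1  -2  -1
+ *  -2   0  +2         0   0   0
+ *  -1   0  +1        +1  +2  +1
+ */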
+
+/**********************************************/
+/* End implementation of Sobel3x3 filter */
+/**********************************************/
+
+/***********************************************/
+/* Begin implementation of Sobel5x5 filter */
+/***********************************************/
+
+/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e. 8 pixels).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] left1_coeff_gx Weight of the most left pixel for gx
+ * @param[in] left2_coeff_gx Weight of the left pixel for gx
+ * @param[in] middle_coeff_gx Weight of the middle pixel for gx
+ * @param[in] right1_coeff_gx Weight of the right pixel for gx
+ * @param[in] right2_coeff_gx Weight of the most right pixel for gx
+ * @param[in] left1_coeff_gy Weight of the most left pixel for gy
+ * @param[in] left2_coeff_gy Weight of the left pixel for gy
+ * @param[in] middle_coeff_gy Weight of the middle pixel for gy
+ * @param[in] right1_coeff_gy Weight of the right pixel for gy
+ * @param[in] right2_coeff_gy Weight of the most right pixel for gy
+ *
+ * @return a short16 containing short8 gx and short8 gy values.
+ */
+short16 sobel1x5(
+ Image *src,
+ const short left1_coeff_gx,
+ const short left2_coeff_gx,
+ const short middle_coeff_gx,
+ const short right1_coeff_gx,
+ const short right2_coeff_gx,
+ const short left1_coeff_gy,
+ const short left2_coeff_gy,
+ const short middle_coeff_gy,
+ const short right1_coeff_gy,
+ const short right2_coeff_gy)
+{
+ uchar16 temp = vload16(0, offset(src, -2, 0));
+ short8 gx = 0;
+ short8 gy = 0;
+ short8 val;
+
+ val = convert_short8(temp.s01234567);
+ gx += val * (short8)left1_coeff_gx;
+ gy += val * (short8)left1_coeff_gy;
+
+ val = convert_short8(temp.s12345678);
+ gx += val * (short8)left2_coeff_gx;
+ gy += val * (short8)left2_coeff_gy;
+
+ val = convert_short8(temp.s23456789);
+ gx += val * (short8)middle_coeff_gx;
+ gy += val * (short8)middle_coeff_gy;
+
+ val = convert_short8(temp.s3456789a);
+ gx += val * (short8)right1_coeff_gx;
+ gy += val * (short8)right1_coeff_gy;
+
+ val = convert_short8(temp.s456789ab);
+ gx += val * (short8)right2_coeff_gx;
+ gy += val * (short8)right2_coeff_gy;
+
+ return (short16)(gx, gy);
+}
+
+/** Compute a 1D vertical sobel filter 5x1 for 8 pixels assuming the input is made of 1 channel of 2 bytes (i.e. 8 S16 values from the horizontal pass).
+ *
+ * @param[in] src Pointer to source image.
+ * @param[in] up1_coeff Weight of the most up pixel
+ * @param[in] up2_coeff Weight of the up pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] down1_coeff Weight of the down pixel
+ * @param[in] down2_coeff Weight of the most down pixel
+ *
+ * @return a short8 containing 8 convoluted values.
+ */
+short8 sobel5x1(
+ Image *src,
+ const short up1_coeff,
+ const short up2_coeff,
+ const short middle_coeff,
+ const short down1_coeff,
+ const short down2_coeff)
+{
+ short8 val;
+ short8 out = (short8)0;
+
+ val = vload8(0, (__global short *)offset(src, 0, -2));
+ out += val * (short8)up1_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, -1));
+ out += val * (short8)up2_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, 0));
+ out += val * (short8)middle_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, 1));
+ out += val * (short8)down1_coeff;
+
+ val = vload8(0, (__global short *)offset(src, 0, 2));
+ out += val * (short8)down2_coeff;
+
+ return (short8)(out);
+}
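+
+/* The 5x5 Sobel kernels are separable into the 1D passes above: Gx is the
+ * outer product of the smoothing column [1 4 6 4 1]^T and the derivative row
+ * [-1 -2 0 2 1], and Gy is the transposed pairing. sobel1x5 applies the
+ * horizontal factors to the U8 input; sobel5x1 then applies the vertical
+ * factors to the S16 intermediate. */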
+
+/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary single channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x5(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+ short16 gx_gy = sobel1x5(&src,
+ -1, -2, 0, 2, 1,
+ 1, 4, 6, 4, 1);
+
+ // Store result in dst
+#ifdef GRAD_X
+ vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
+ * and output two single channel S16 images.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in]  src_x_ptr                            Pointer to the source image. Supported data types: S16
+ * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_y_ptr Pointer to the source image. Supported data types: S16
+ * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  dummy                                Dummy parameter to ease conditional inclusion
+ */
+__kernel void sobel_separable5x1(
+#ifdef GRAD_X
+ IMAGE_DECLARATION(src_x),
+ IMAGE_DECLARATION(dst_gx),
+#endif
+#ifdef GRAD_Y
+ IMAGE_DECLARATION(src_y),
+ IMAGE_DECLARATION(dst_gy),
+#endif
+ int dummy)
+{
+#ifdef GRAD_X
+ Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+#ifdef GRAD_X
+ short8 gx = sobel5x1(&src_x,
+ 1, 4, 6, 4, 1);
+ vstore8(gx, 0, ((__global short *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ short8 gy = sobel5x1(&src_y,
+ -1, -2, 0, 2, 1);
+ vstore8(gy, 0, ((__global short *)dst_gy.ptr));
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel5x5 filter */
+/**********************************************/
+
+/***********************************************/
+/* Begin implementation of Sobel7x7 filter */
+/***********************************************/
+
+/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */
+#define X0 -1
+#define X1 -4
+#define X2 -5
+#define X3 0
+#define X4 5
+#define X5 4
+#define X6 1
+
+/* Sobel 7x1 vertical X / 1x7 horizontal Y coefficients */
+#define Y0 1
+#define Y1 6
+#define Y2 15
+#define Y3 20
+#define Y4 15
+#define Y5 6
+#define Y6 1
+
+/* Calculates a single horizontal iteration. */
+#define SOBEL1x1_HOR(src, gx, gy, idx) \
+ { \
+ int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
+ gx += val * X##idx; \
+ gy += val * Y##idx; \
+ }
+
+/* Calculates a single vertical iteration. */
+#define SOBEL1x1_VERT(src, g, direction, idx) \
+ { \
+ int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
+ g += val * (int8)direction##idx; \
+ }
+
+/* Calculates a 1x7 horizontal iteration. */
+#define SOBEL1x7(ptr, gx, gy) \
+ SOBEL1x1_HOR(ptr, gx, gy, 0) \
+ SOBEL1x1_HOR(ptr, gx, gy, 1) \
+ SOBEL1x1_HOR(ptr, gx, gy, 2) \
+ SOBEL1x1_HOR(ptr, gx, gy, 3) \
+ SOBEL1x1_HOR(ptr, gx, gy, 4) \
+ SOBEL1x1_HOR(ptr, gx, gy, 5) \
+ SOBEL1x1_HOR(ptr, gx, gy, 6)
+
+/* Calculates a 7x1 vertical iteration. */
+#define SOBEL7x1(ptr, g, direction) \
+ SOBEL1x1_VERT(ptr, g, direction, 0) \
+ SOBEL1x1_VERT(ptr, g, direction, 1) \
+ SOBEL1x1_VERT(ptr, g, direction, 2) \
+ SOBEL1x1_VERT(ptr, g, direction, 3) \
+ SOBEL1x1_VERT(ptr, g, direction, 4) \
+ SOBEL1x1_VERT(ptr, g, direction, 5) \
+ SOBEL1x1_VERT(ptr, g, direction, 6)
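+
+/* As in the 5x5 case, the 7x7 Sobel kernels are separable (illustrative):
+ * Gx = Y^T * X and Gy = X^T * Y, with derivative factor
+ * X = [-1 -4 -5 0 5 4 1] and smoothing factor Y = [1 6 15 20 15 6 1].
+ * sobel_separable1x7 applies both horizontal passes; sobel_separable7x1
+ * applies the matching vertical pass (Y over gx, X over gy). */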
+
+/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary single channel S32 images, leaving the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void sobel_separable1x7(
+ IMAGE_DECLARATION(src)
+#ifdef GRAD_X
+ ,
+ IMAGE_DECLARATION(dst_gx)
+#endif
+#ifdef GRAD_Y
+ ,
+ IMAGE_DECLARATION(dst_gy)
+#endif
+)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+#ifdef GRAD_X
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+ int8 gx = (int8)0;
+ int8 gy = (int8)0;
+
+ SOBEL1x7(&src, gx, gy);
+
+ // Store result in dst
+#ifdef GRAD_X
+ vstore8(gx, 0, ((__global int *)dst_gx.ptr));
+#endif
+#ifdef GRAD_Y
+ vstore8(gy, 0, ((__global int *)dst_gy.ptr));
+#endif
+}
+
+/** Apply a 7x1 convolution matrix to two single channel S32 input temporary images and output two single channel S32 images, leaving the borders undefined.
+ *
+ * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
+ * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
+ *
+ * @param[in] src_x_ptr Pointer to the source image. Supported data types: S32
+ * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32
+ * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S32
+ * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  dummy                                Dummy parameter to ease conditional inclusion
+ */
+__kernel void sobel_separable7x1(
+#ifdef GRAD_X
+ IMAGE_DECLARATION(src_x),
+ IMAGE_DECLARATION(dst_gx),
+#endif
+#ifdef GRAD_Y
+ IMAGE_DECLARATION(src_y),
+ IMAGE_DECLARATION(dst_gy),
+#endif
+ int dummy)
+{
+#ifdef GRAD_X
+ Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
+ Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
+#endif
+#ifdef GRAD_Y
+ Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
+ Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
+#endif
+
+ // Output pixels
+#ifdef GRAD_X
+ int8 gx = 0;
+ SOBEL7x1(&src_x, gx, Y);
+ vstore8(gx, 0, (__global int *)dst_gx.ptr);
+#endif
+#ifdef GRAD_Y
+ int8 gy = 0;
+ SOBEL7x1(&src_y, gy, X);
+ vstore8(gy, 0, (__global int *)dst_gy.ptr);
+#endif
+}
+
+/**********************************************/
+/* End implementation of Sobel7x7 filter */
+/**********************************************/
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
new file mode 100644
index 0000000000..632b4a5374
--- /dev/null
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined USE_F16
+#define MINVAL HALF_MIN
+#define SELECT_DATA_TYPE short
+#define DATA_TYPE half
+#else
+#define MINVAL FLT_MIN
+#define SELECT_DATA_TYPE int
+#define DATA_TYPE float
+#endif
+
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=short
+ * @note In case F16 is used -DUSE_F16 must be passed, otherwise the kernel will default to using F32.
+ * @note In case the input is not a multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] width Input image width
+ */
+__kernel void softmax_layer_max(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Initialize local maximum
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ max_val = (VEC_DATA_TYPE(DATA_TYPE, 16))type_min;
+
+ // Calculate max of row
+ const uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ max_val = max(data, max_val);
+ }
+
+#if defined NON_MULTIPLE_OF_16
+ // Handle non multiple of 16
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
+ max_val = max(max_val, select(type_min, data, widx));
+#endif
+
+ // Perform max reduction
+ max_val.s01234567 = max(max_val.s01234567, max_val.s89ABCDEF);
+ max_val.s0123 = max(max_val.s0123, max_val.s4567);
+ max_val.s01 = max(max_val.s01, max_val.s23);
+ max_val.s0 = max(max_val.s0, max_val.s1);
+
+ // Store result
+ *((__global DATA_TYPE *)dst.ptr) = max_val.s0;
+}
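+
+/* Illustrative trace of the pairwise reduction above: the 16 running maxima
+ * are folded 16 -> 8 -> 4 -> 2 -> 1 lanes, so lane s0 ends up holding
+ * max(s0, ..., sF) after log2(16) = 4 steps instead of 15 sequential ones. */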
+
+/** Shifts the values of the input tensor by the max calculated in the softmax_layer_max kernel,
+ * then computes the exponential of each shifted element and sums the results across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=short
+ * @note In case F16 is used -DUSE_F16 must be passed, otherwise the kernel will default to using F32.
+ * @note In case the input is not a multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: F16, F32
+ * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
+ * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in] width Input image width
+ */
+__kernel void softmax_layer_shift_exp_sum(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(max),
+ IMAGE_DECLARATION(dst),
+ IMAGE_DECLARATION(sum),
+ uint width)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image max = CONVERT_TO_IMAGE_STRUCT(max);
+ Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+
+ // Load max value of 1D logits vector (row)
+ DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
+
+ // Set sum vector
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ sum1D = 0;
+
+ // Shift values, exp and sum
+ const uint width4 = width >> 4;
+ for(uint i = 0; i < width4; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
+ data = exp(data - max_val);
+ vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
+ sum1D += data;
+ }
+
+#if defined NON_MULTIPLE_OF_16
+ // Handle non multiple of 16
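+    // Same tail handling as in softmax_layer_max, except that out-of-range lanes are zeroed
+    // (and stored as zero) so they contribute nothing to the running sum.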
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
+ data = exp(data - max_val);
+ VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
+ widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
+ data = select(0, data, widx);
+ vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, width4 << 4, 0));
+ sum1D += data;
+#endif
+
+    // Perform sum reduction
+ sum1D.s01234567 = sum1D.s01234567 + sum1D.s89ABCDEF;
+ sum1D.s0123 = sum1D.s0123 + sum1D.s4567;
+ sum1D.s01 = sum1D.s01 + sum1D.s23;
+ sum1D.s0 = sum1D.s0 + sum1D.s1;
+
+ // Calculate and store result
+ *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+}
+
+/** Divides all the values of the input tensor by the sum calculated in the softmax_layer_shift_exp_sum kernel.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ *
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16, F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: F16, F32
+ * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: F16, F32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ Image sum = CONVERT_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load sum value of 1D logits vector (row)
+ DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
+ vstore16(data / sum_val, 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+}
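+
+/* Putting the three passes together (illustrative pseudocode only, not part of the kernels):
+ *
+ *     for each row r:
+ *         m      = max(src[r][:])                          // softmax_layer_max
+ *         dst[r] = exp(src[r][:] - m); s = sum(dst[r][:])  // softmax_layer_shift_exp_sum
+ *         dst[r] = dst[r][:] / s                           // softmax_layer_norm
+ *
+ * Subtracting the row maximum before exponentiation keeps exp() in a numerically safe range
+ * without changing the result, since softmax is invariant to shifts of its input.
+ */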
diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
new file mode 100644
index 0000000000..cee116bd75
--- /dev/null
+++ b/src/core/CL/cl_kernels/tablelookup.cl
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs table lookup on U8 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] lut LUT table. Supported data types: U8
+ */
+__kernel void tablelookup_U8(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __global uchar *lut)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load input data */
+ uchar8 data = vload8(0, src.ptr);
+
+ /* Load lut data */
+ uchar8 lut_data = (uchar8)(lut[data.s0], lut[data.s1], lut[data.s2], lut[data.s3],
+ lut[data.s4], lut[data.s5], lut[data.s6], lut[data.s7]);
+
+ /* Store result */
+ vstore8(lut_data, 0, dst.ptr);
+}
+
+/** This function performs table lookup on S16 input/output images.
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 8), height ]
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: S16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: S16
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] lut LUT table. Supported data types: S16
+ * @param[in] offset LUT offset
+ * @param[in] count Number of elements in the LUT
+ */
+__kernel void tablelookup_S16(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ __global short *lut,
+ uint offset,
+ uint count)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ /* Load input data */
+ short8 data = vload8(0, (__global short *)src.ptr);
+
+ /* Load output data */
+ int8 out_data = convert_int8(vload8(0, (__global short *)dst.ptr));
+
+ /* Calculate index */
+ int8 index = convert_int8(data) + (int8)(offset);
+ int8 cond = (index >= 0 && index < (int8)count);
+ index = select(0, index, cond);
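+    /* Out-of-range lanes are forced to index 0 so the gather below never reads outside the
+       LUT; the final select() restores the original output value for those lanes. */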
+
+ /* Load lut data */
+ int8 lut_data = (int8)(lut[index.s0], lut[index.s1], lut[index.s2], lut[index.s3],
+ lut[index.s4], lut[index.s5], lut[index.s6], lut[index.s7]);
+
+ /* Select output data depending on condition */
+ lut_data = select(out_data, lut_data, cond);
+
+ /* Store result */
+ vstore8(convert_short8(lut_data), 0, (__global short *)dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl
new file mode 100644
index 0000000000..2b1e6ff35d
--- /dev/null
+++ b/src/core/CL/cl_kernels/threshold.cl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform binary thresholding on an image.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] false_val False value
+ * @param[in] true_val True value
+ * @param[in] threshold The threshold value
+ */
+__kernel void threshold_binary(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const uchar false_val,
+ const uchar true_val,
+ const uchar threshold)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ uchar16 in_data = vload16(0, in.ptr);
+
+ // Perform binary thresholding
+ in_data = select((uchar16)false_val, (uchar16)true_val, in_data > (uchar16)threshold);
+
+ // Store result
+ vstore16(in_data, 0, out.ptr);
+}
+
+/** Perform range thresholding on an image.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] false_val False value
+ * @param[in] true_val True value
+ * @param[in] lower Lower threshold
+ * @param[in] upper Upper threshold
+ */
+__kernel void threshold_range(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const uchar false_val,
+ const uchar true_val,
+ const uchar lower,
+ const uchar upper)
+{
+ // Get pixels pointer
+ Image in = CONVERT_TO_IMAGE_STRUCT(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+
+ // Load data
+ uchar16 in_data = vload16(0, in.ptr);
+
+ // Perform range thresholding
+ in_data = select((uchar16)true_val, (uchar16)false_val, in_data > (uchar16)upper || in_data < (uchar16)lower);
+
+ // Store result
+ vstore16(in_data, 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
new file mode 100644
index 0000000000..c30158f280
--- /dev/null
+++ b/src/core/CL/cl_kernels/transpose.cl
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define SWAP_ROW(u0, l0) \
+ ({ \
+ tmp_swap = u0; \
+ u0 = l0; \
+ l0 = tmp_swap; \
+ })
+
+#define SWAP_4x4(u0, u1, u2, u3, l0, l1, l2, l3) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ tmp_swap; \
+ SWAP_ROW(u0, l0); \
+ SWAP_ROW(u1, l1); \
+ SWAP_ROW(u2, l2); \
+ SWAP_ROW(u3, l3); \
+ })
+
+#define SWAP_8x8(u0, u1, u2, u3, u4, u5, u6, u7, l0, l1, l2, l3, l4, l5, l6, l7) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 8) \
+ tmp_swap; \
+ SWAP_ROW(u0, l0); \
+ SWAP_ROW(u1, l1); \
+ SWAP_ROW(u2, l2); \
+ SWAP_ROW(u3, l3); \
+ SWAP_ROW(u4, l4); \
+ SWAP_ROW(u5, l5); \
+ SWAP_ROW(u6, l6); \
+ SWAP_ROW(u7, l7); \
+ })
+
+#define TRANSPOSE_4x4(u0, u1, u2, u3) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ tmp; \
+ tmp.s012 = u0.s123; \
+ u0.s1 = u1.s0; \
+ u0.s2 = u2.s0; \
+ u0.s3 = u3.s0; \
+ u1.s0 = tmp.s0; \
+ u2.s0 = tmp.s1; \
+ u3.s0 = tmp.s2; \
+ \
+ tmp.s01 = u1.s23; \
+ u1.s2 = u2.s1; \
+ u1.s3 = u3.s1; \
+ u2.s1 = tmp.s0; \
+ u3.s1 = tmp.s1; \
+ \
+ tmp.s0 = u2.s3; \
+ u2.s3 = u3.s2; \
+ u3.s2 = tmp.s0; \
+ })
+
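+// An NxN transpose is built recursively from half-size blocks:
+// [[A B], [C D]]^T = [[A^T C^T], [B^T D^T]]
+// i.e. transpose each quadrant in place, then swap the two off-diagonal quadrants.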
+#define TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7) \
+ ({ \
+ TRANSPOSE_4x4(u0.s0123, u1.s0123, u2.s0123, u3.s0123); \
+ TRANSPOSE_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567); \
+ TRANSPOSE_4x4(u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
+ TRANSPOSE_4x4(u4.s4567, u5.s4567, u6.s4567, u7.s4567); \
+ SWAP_4x4(u0.s4567, u1.s4567, u2.s4567, u3.s4567, u4.s0123, u5.s0123, u6.s0123, u7.s0123); \
+ })
+
+#define TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15) \
+ ({ \
+ TRANSPOSE_8x8(u0.s01234567, u1.s01234567, u2.s01234567, u3.s01234567, u4.s01234567, u5.s01234567, u6.s01234567, u7.s01234567); \
+ TRANSPOSE_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF); \
+ TRANSPOSE_8x8(u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
+ TRANSPOSE_8x8(u8.s89ABCDEF, u9.s89ABCDEF, u10.s89ABCDEF, u11.s89ABCDEF, u12.s89ABCDEF, u13.s89ABCDEF, u14.s89ABCDEF, u15.s89ABCDEF); \
+ SWAP_8x8(u0.s89ABCDEF, u1.s89ABCDEF, u2.s89ABCDEF, u3.s89ABCDEF, u4.s89ABCDEF, u5.s89ABCDEF, u6.s89ABCDEF, u7.s89ABCDEF, \
+ u8.s01234567, u9.s01234567, u10.s01234567, u11.s01234567, u12.s01234567, u13.s01234567, u14.s01234567, u15.s01234567); \
+ })
+
+#ifndef DATA_TYPE_IN_BYTES
+#error DATA_TYPE_IN_BYTES not set for the transpose OpenCL kernel
+#endif
+
+#if DATA_TYPE_IN_BYTES == 4
+#define DATA_TYPE uint
+#define TRANSPOSE() TRANSPOSE_4x4(u0, u1, u2, u3)
+#define VLOAD(x, y) vload4(x, y)
+#define VSTORE(x, y, z) vstore4(x, y, z)
+#define BLOCK_SIZE 4
+#elif DATA_TYPE_IN_BYTES == 2
+#define DATA_TYPE ushort
+#define TRANSPOSE() TRANSPOSE_8x8(u0, u1, u2, u3, u4, u5, u6, u7)
+#define VLOAD(x, y) vload8(x, y)
+#define VSTORE(x, y, z) vstore8(x, y, z)
+#define BLOCK_SIZE 8
+#elif DATA_TYPE_IN_BYTES == 1
+#define DATA_TYPE uchar
+#define TRANSPOSE() TRANSPOSE_16x16(u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15)
+#define VLOAD(x, y) vload16(x, y)
+#define VSTORE(x, y, z) vstore16(x, y, z)
+#define BLOCK_SIZE 16
+#else
+#error DATA_TYPE_IN_BYTES not supported for transpose
+#endif
+
+/** This OpenCL kernel computes the matrix transposition of the input matrix
+ *
+ * @attention The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
+ * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
+ *
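+ * A minimal host-side sketch, following the same pattern used by the warp kernels:\n
+ * const char build_options [] = "-DDATA_TYPE_IN_BYTES=4";\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *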
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ uint x = get_global_id(0) * BLOCK_SIZE;
+ uint y = get_global_id(1) * BLOCK_SIZE;
+
+ // Compute source address
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+ // Load the NxN block at (x, y)
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u0 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u1 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u2 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u3 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+#if BLOCK_SIZE > 4
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u4 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 4)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u5 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 5)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u6 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 6)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u7 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 7)));
+#if BLOCK_SIZE == 16
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u8 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 8)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u9 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 9)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u10 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 10)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u11 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 11)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u12 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 12)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u13 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 13)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u14 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 14)));
+ VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+ u15 = VLOAD(0, (__global DATA_TYPE *)(offset(&src, 0, 15)));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+
+ // Transpose the block
+ TRANSPOSE();
+
+ // Store the block at (y, x)
+ uint dst_offset_in_bytes = y * DATA_TYPE_IN_BYTES + x * dst_stride_y + dst_offset_first_element_in_bytes;
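+    // The block read at (x, y) is written at (y, x): y scales the element size along X,
+    // while x selects the destination row via dst_stride_y.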
+ VSTORE(u0, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 0 * dst_stride_y));
+ VSTORE(u1, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 1 * dst_stride_y));
+ VSTORE(u2, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 2 * dst_stride_y));
+ VSTORE(u3, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 3 * dst_stride_y));
+#if BLOCK_SIZE > 4
+ VSTORE(u4, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 4 * dst_stride_y));
+ VSTORE(u5, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 5 * dst_stride_y));
+ VSTORE(u6, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 6 * dst_stride_y));
+ VSTORE(u7, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 7 * dst_stride_y));
+#if BLOCK_SIZE == 16
+ VSTORE(u8, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 8 * dst_stride_y));
+ VSTORE(u9, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 9 * dst_stride_y));
+ VSTORE(u10, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 10 * dst_stride_y));
+ VSTORE(u11, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 11 * dst_stride_y));
+ VSTORE(u12, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 12 * dst_stride_y));
+ VSTORE(u13, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 13 * dst_stride_y));
+ VSTORE(u14, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 14 * dst_stride_y));
+ VSTORE(u15, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_in_bytes + 15 * dst_stride_y));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+}
diff --git a/src/core/CL/cl_kernels/types.h b/src/core/CL/cl_kernels/types.h
new file mode 100644
index 0000000000..87736465d2
--- /dev/null
+++ b/src/core/CL/cl_kernels/types.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TYPES_H
+#define ARM_COMPUTE_TYPES_H
+
+/** 2D Coordinates structure */
+typedef struct Coordinates2D
+{
+ int x; /**< The x coordinate. */
+ int y; /**< The y coordinate. */
+} Coordinates2D;
+
+/** Keypoint struct */
+typedef struct Keypoint
+{
+ int x; /**< The x coordinate. */
+ int y; /**< The y coordinate. */
+ float strength; /**< The strength of the keypoint. Its definition is specific to the corner detector. */
+ float scale; /**< Initialized to 0 by corner detectors. */
+ float orientation; /**< Initialized to 0 by corner detectors. */
+ int tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
+ float error; /**< A tracking method specific error. Initialized to 0 by corner detectors. */
+} Keypoint;
+
+/** Detection window struct */
+typedef struct DetectionWindow
+{
+ ushort x; /**< Top-left x coordinate */
+ ushort y; /**< Top-left y coordinate */
+ ushort width; /**< Width of the detection window */
+ ushort height; /**< Height of the detection window */
+ ushort idx_class; /**< Index of the class */
+ float score; /**< Confidence value for the detection window */
+} DetectionWindow;
+#endif // ARM_COMPUTE_TYPES_H
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
new file mode 100644
index 0000000000..0a4748f452
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_affine.cl
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Returns a vector of floats containing the affine matrix coefficients. */
+inline const float8 build_affine_mtx()
+{
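+    // The 2x3 affine matrix has six coefficients; the last two lanes merely pad the float8.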
+ return (float8)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, 0, 0);
+}
+
+/** Transforms four 2D coordinates using the formula:
+ *
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx affine matrix
+ *
+ * @return a float8 containing four 2D transformed values.
+ */
+inline const float8 apply_affine_transform(const float2 coord, const float8 mtx)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ // transform [x,x+1,x+2,x+3]
+ const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4)));
+ // transform [y,y+1,y+2,y+3]
+ const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5)));
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs an affine transform on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation:
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * output(x,y) = input(x0,y0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_affine_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs an affine transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_affine_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(bilinear_interpolate(&in, clamp_to_border(apply_affine_transform(get_current_coords(), build_affine_mtx()), width, height), width, height), 0, out.ptr);
+}
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
new file mode 100644
index 0000000000..26a8b859a4
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border(float8 coords, const float width, const float height)
+{
+ const float4 clamped_x = clamp(coords.even, -1.0f, width);
+ const float4 clamped_y = clamp(coords.odd, -1.0f, height);
+ return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/** Reads four texels from the input image. The coords vector determines which texels are read.
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of coordinates to be read from the image.
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int8 coords)
+{
+ return (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)offset(in, coords.s0, coords.s1)),
+ *((__global DATA_TYPE *)offset(in, coords.s2, coords.s3)),
+ *((__global DATA_TYPE *)offset(in, coords.s4, coords.s5)),
+ *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords()
+{
+ return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Given a texel's coordinates, this function returns the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below-right neighbour ]
+ *
+ * @note No checks to see if the coordinates are out of the image are done here.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+*/
+inline const float8 get_neighbour_coords(const float2 coord)
+{
+ return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width Width of the image
+ * @param[in] height Height of the image
+*/
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    // If any of the four texels falls outside the image's boundaries, the border value (REPLICATE or CONSTANT) is used instead.
+
+    // Compute the 2x2 neighbourhood coordinates for each of the four input texels
+ const float8 fc = floor(coords);
+ const float16 c1 = (float16)(
+ clamp_to_border(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height),
+ clamp_to_border(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height));
+ const float16 c2 = (float16)(
+ clamp_to_border(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height),
+ clamp_to_border(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height));
+ // Loads the values from the input image
+ const float16 t = (float16)(
+ /* tl, tr, bl, br */
+ * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+ *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+ *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+ *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+ *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+ *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+ *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+ *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
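+    // a holds the fractional offsets within the top-left texel and b the complementary
+    // weights; each result is the standard bilinear blend
+    // tl*b.x*b.y + tr*a.x*b.y + bl*b.x*a.y + br*a.x*a.y.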
+ const float8 a = coords - fc;
+ const float8 b = ((float8)(1.f)) - a;
+ const float4 fr = (float4)(
+ ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+ ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+ ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+ ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+ return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
+}
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
new file mode 100644
index 0000000000..863b6c9e96
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Returns the perspective matrix */
+inline const float16 build_perspective_mtx()
+{
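+    // The 3x3 perspective matrix has nine coefficients; the remaining seven lanes merely pad the float16.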
+ return (float16)(MAT0, MAT1, MAT2, MAT3, MAT4, MAT5, MAT6, MAT7, MAT8, 0, 0, 0, (float4)0);
+}
+
+/** Transforms four 2D coordinates using the formula:
+ *
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ * The transformed 2D coordinates are (x0/z0, y0/z0).
+ *
+ * @param[in] coord 2D coordinate to transform.
+ * @param[in] mtx perspective matrix
+ *
+ * @return a float8 containing four 2D transformed values.
+ */
+inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ // transform [z,z+1,z+2,z+3]
+ const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
+ // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
+ // transform [x,x+1,x+2,x+3]
+ const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
+ // transform [y,y+1,y+2,y+3]
+ const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z;
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+}
+
+/** Performs a perspective transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8.
+ *
+ * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation:
+ * x0 = M[1][1] * x + M[1][2] * y + M[1][3]
+ * y0 = M[2][1] * x + M[2][2] * y + M[2][3]
+ * z0 = M[3][1] * x + M[3][2] * y + M[3][3]
+ *
+ * output(x,y) = input(x0/z0,y0/z0)
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_perspective_nearest_neighbour(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height))), 0, out.ptr);
+}
+
+/** Performs a perspective transform on an image interpolating with the BILINEAR method. Input and output are single channel U8.
+ *
+ * @attention The matrix coefficients need to be passed at compile time:\n
+ * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n
+ * clBuildProgram( program, 0, NULL, build_options, NULL, NULL);
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] width Width of the destination image
+ * @param[in] height Height of the destination image
+ */
+__kernel void warp_perspective_bilinear(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const int width,
+ const int height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ vstore4(bilinear_interpolate(&in, clamp_to_border(apply_perspective_transform(get_current_coords(), build_perspective_mtx()), width, height), width, height), 0, out.ptr);
+}
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000000..685b8e234e
--- /dev/null
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "The output image can only be U8 if both input images are U8");
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.insert("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.insert("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("absdiff", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
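+    // Enqueue the kernel once per 2D slice of the execution window, rebinding the three
+    // tensors' strides and offsets for each slice.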
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
new file mode 100644
index 0000000000..6333f04e71
--- /dev/null
+++ b/src/core/CL/kernels/CLAccumulateKernel.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate"));
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_weighted"));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, alpha);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(shift > 15);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_squared"));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, shift);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
new file mode 100644
index 0000000000..83bbe6a3be
--- /dev/null
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Auto-initialize the output if it has not been initialized yet
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ // Set build options
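+    // A and B are the activation function's parameters, e.g. the upper bound of a bounded ReLU or the a * tanh(b * x) factors of TANH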
+ std::set<std::string> build_opts;
+ build_opts.insert(("-D" + string_from_activation_func(act_info.activation())));
+ build_opts.insert(("-D" + ((is_data_type_float(input->info()->data_type())) ? std::string("TYPE_FP") : std::string("TYPE_INT"))));
+ build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.insert(("-DA=" + val_to_string(act_info.a())));
+ build_opts.insert(("-DB=" + val_to_string(act_info.b())));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
new file mode 100644
index 0000000000..aaa62d0268
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLArithmeticAdditionKernel::CLArithmeticAdditionKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+ // Check for invalid combination
+ if(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8))
+ {
+ ARM_COMPUTE_ERROR("You called with the wrong data types.");
+ }
+
+ // Set kernel build options
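+    // Saturation only applies to integer arithmetic, so floating-point outputs always take the plain (WRAP) addition path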
+ std::set<std::string> build_opts;
+ build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
new file mode 100644
index 0000000000..4c847276da
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ // Check for invalid combination
+ if(output->info()->data_type() == DataType::U8)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ }
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+    const bool has_float_out = is_data_type_float(output->info()->data_type());
+
+ // Set kernel build options
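+    // As for addition, saturation is an integer-only concept: floating-point outputs always use the plain (WRAP) subtraction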
+ std::set<std::string> build_opts;
+ build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..309a153b7a
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
+{
+}
+
+void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+ float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
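+    // Batch normalization is applied per channel (out = gamma * (x - mean) / sqrt(var + epsilon) + beta), so each parameter vector needs one entry per input channel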
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _var = var;
+ _beta = beta;
+ _gamma = gamma;
+ _epsilon = epsilon;
+
+ // Create kernel
+ std::string kernel_name = "batchnormalization_layer";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set kernel static arguments
+    unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input/output 3D tensors and the four 1D parameter vectors
+ _kernel.setArg<cl_float>(idx++, _epsilon);
+
+ // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
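+    // Collapse the 1D window: mean, var, beta and gamma are passed whole, once, outside the slice loop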
+ Window vector_slice = window.first_slice_window_1D();
+ vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ add_1D_tensor_argument(idx, _mean, vector_slice);
+ add_1D_tensor_argument(idx, _var, vector_slice);
+ add_1D_tensor_argument(idx, _beta, vector_slice);
+ add_1D_tensor_argument(idx, _gamma, vector_slice);
+
+ do
+ {
+ idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
new file mode 100644
index 0000000000..5ea4a86da5
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBitwiseAndKernel::CLBitwiseAndKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBitwiseAndKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_and"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
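+    // The output is only valid where both inputs carry valid data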
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLBitwiseAndKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
new file mode 100644
index 0000000000..0098e15ab6
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void CLBitwiseNotKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_not"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
new file mode 100644
index 0000000000..2eeef0a993
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBitwiseOrKernel::CLBitwiseOrKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBitwiseOrKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_or"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLBitwiseOrKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
new file mode 100644
index 0000000000..c19a78e1c4
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLBitwiseXorKernel::CLBitwiseXorKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLBitwiseXorKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_xor"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLBitwiseXorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
new file mode 100644
index 0000000000..e113d30210
--- /dev/null
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLBox3x3Kernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+    // Set build options: a 3x3 box filter is the static convolution with all nine coefficients set to 1 and a normalization scale of 9
+ std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=1", "-DMAT2=1",
+ "-DMAT3=1", "-DMAT4=1", "-DMAT5=1",
+ "-DMAT6=1", "-DMAT7=1", "-DMAT8=1",
+ "-DSCALE=9", "-DDATA_TYPE_OUT=uchar"
+ };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
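+    // Each work-item writes 8 pixels but reads a 16-element wide, 3-row high neighbourhood to cover the 3x3 filter support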
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
new file mode 100644
index 0000000000..5d06d34631
--- /dev/null
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGradientKernel::CLGradientKernel()
+ : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()),
+ "Gx and Gy must have the same pixel size");
+ ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()),
+ "Mag must have the same pixel size as Gx and Gy");
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ // Create build opts
+ std::set<std::string> built_opts;
+ built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type()));
+ built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(gx->info()->data_type()));
+
+ // Create kernel
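+    // norm_type == 1 selects the L1 norm (|gx| + |gy|); any other value selects the L2 norm (sqrt(gx^2 + gy^2))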
+ const std::string kernel_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2");
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, built_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
+
+ mag_access.set_valid_region(win, _gx->info()->valid_region());
+ phase_access.set_valid_region(win, _gx->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _gx, slice);
+ add_2D_tensor_argument(idx, _gy, slice);
+ add_2D_tensor_argument(idx, _magnitude, slice);
+ add_2D_tensor_argument(idx, _phase, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel()
+ : _magnitude(nullptr), _phase(nullptr), _output(nullptr)
+{
+}
+
+BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32);
+
+ _magnitude = magnitude;
+ _phase = phase;
+ _output = output;
+
+ // Create build opts
+ std::set<std::string> built_opts;
+ built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
+ built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("suppress_non_maximum", built_opts));
+
+ // Set minimum threshold argument
+    unsigned int idx = 3 * num_arguments_per_2D_tensor(); // Skip the magnitude, phase and output tensor arguments
+ _kernel.setArg(idx++, lower_thr);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_written_per_iteration = 3;
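+    // Each pixel is compared with its two neighbours along the gradient direction, hence the 3x3 read window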
+
+ Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top,
+ num_elems_read_written_per_iteration, num_elems_read_written_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, mag_access, phase_access, output_access);
+
+ output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _magnitude, slice);
+ add_2D_tensor_argument(idx, _phase, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLEdgeTraceKernel::CLEdgeTraceKernel()
+ : _input(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0), _visited(nullptr), _recorded(nullptr), _l1_stack(nullptr), _l1_stack_counter(nullptr)
+{
+}
+
+void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
+ ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);
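+    // Hysteresis keeps pixels above upper_thr as strong edges and promotes pixels between the two thresholds only if they connect to a strong edge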
+
+ _input = input;
+ _output = output;
+ _lower_thr = lower_thr;
+ _upper_thr = upper_thr;
+ _visited = visited;
+ _recorded = recorded;
+ _l1_stack = l1_stack;
+ _l1_stack_counter = l1_stack_counter;
+
+ // Create build opts
+ std::set<std::string> built_opts;
+ built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hysteresis", built_opts));
+
+ // Set constant kernel args
+ unsigned int width = _input->info()->dimension(0);
+ unsigned int height = _input->info()->dimension(1);
+    unsigned int idx = 6 * num_arguments_per_2D_tensor(); // Skip the six 2D tensor arguments set in run()
+ _kernel.setArg(idx++, static_cast<cl_uint>(_lower_thr));
+ _kernel.setArg(idx++, static_cast<cl_uint>(_upper_thr));
+ _kernel.setArg(idx++, static_cast<cl_uint>(width));
+ _kernel.setArg(idx++, static_cast<cl_uint>(height));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal visited_access(_visited->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal recorded_access(_recorded->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal l1_stack_access(_l1_stack->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal l1_stack_counter_access(_l1_stack_counter->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(_input->info(), 0, num_elems_processed_per_iteration),
+ output_access,
+ visited_access,
+ recorded_access,
+ l1_stack_access,
+ l1_stack_counter_access);
+
+ output_access.set_valid_region(win, _input->info()->valid_region());
+ visited_access.set_valid_region(win, _input->info()->valid_region());
+ recorded_access.set_valid_region(win, _input->info()->valid_region());
+ l1_stack_access.set_valid_region(win, _input->info()->valid_region());
+ l1_stack_counter_access.set_valid_region(win, _input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ add_2D_tensor_argument(idx, _visited, slice);
+ add_2D_tensor_argument(idx, _recorded, slice);
+ add_2D_tensor_argument(idx, _l1_stack, slice);
+ add_2D_tensor_argument(idx, _l1_stack_counter, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
new file mode 100644
index 0000000000..d729ebcfb3
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLChannelCombineKernel::CLChannelCombineKernel()
+ : _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }
+{
+}
+
+void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+
+ const Format fmt = output->info()->format();
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ if(Format::RGBA8888 == fmt)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
+ _planes[3] = plane3;
+ }
+ else
+ {
+ _planes[3] = nullptr;
+ }
+ _output = output;
+ _output_multi = nullptr;
+
+ // Half the processed elements for U,V channels due to sub-sampling of 2
+ if(Format::YUYV422 == fmt || Format::UYVY422 == fmt)
+ {
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ }
+ else
+ {
+ _x_subsampling = { { 1, 1, 1 } };
+ _y_subsampling = { { 1, 1, 1 } };
+ }
+
+ // Create kernel
+ std::string kernel_name = "channel_combine_" + string_from_format(fmt);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+ AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
+ plane1->info()->valid_region(),
+ plane2->info()->valid_region());
+ if(plane3 != nullptr)
+ {
+ valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
+ }
+ output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
+ _output = nullptr;
+ _output_multi = output;
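+    // NV12/NV21 are semi-planar: the interleaved U/V pair shares the second plane, so only two output planes are written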
+ bool has_two_planars = false;
+
+ // Set sub-sampling parameters for each plane
+ const Format fmt = output->info()->format();
+ std::string kernel_name;
+ std::set<std::string> build_opts;
+
+ if(Format::NV12 == fmt || Format::NV21 == fmt)
+ {
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ kernel_name = "channel_combine_NV";
+ build_opts.emplace(Format::NV12 == fmt ? "-DNV12" : "-DNV21");
+ has_two_planars = true;
+ }
+ else
+ {
+ if(Format::IYUV == fmt)
+ {
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ }
+ else
+ {
+ _x_subsampling = { { 1, 1, 1 } };
+ _y_subsampling = { { 1, 1, 1 } };
+ }
+
+ kernel_name = "copy_planes_3p";
+ build_opts.emplace(Format::IYUV == fmt ? "-DIYUV" : "-DYUV444");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+ AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+
+ update_window_and_padding(win,
+ input_plane0_access, input_plane1_access, input_plane2_access,
+ output_plane0_access, output_plane1_access, output_plane2_access);
+
+ ValidRegion plane0_valid_region = plane0->info()->valid_region();
+ ValidRegion output_plane1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
+ output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ // Subsampling in plane 1
+ Window win_sub_plane1(slice);
+ win_sub_plane1.set(Window::DimX, Window::Dimension(win_sub_plane1.x().start() / _x_subsampling[1], win_sub_plane1.x().end() / _x_subsampling[1], win_sub_plane1.x().step() / _x_subsampling[1]));
+ win_sub_plane1.set(Window::DimY, Window::Dimension(win_sub_plane1.y().start() / _y_subsampling[1], win_sub_plane1.y().end() / _y_subsampling[1], 1));
+
+ // Subsampling in plane 2
+ Window win_sub_plane2(slice);
+ win_sub_plane2.set(Window::DimX, Window::Dimension(win_sub_plane2.x().start() / _x_subsampling[2], win_sub_plane2.x().end() / _x_subsampling[2], win_sub_plane2.x().step() / _x_subsampling[2]));
+ win_sub_plane2.set(Window::DimY, Window::Dimension(win_sub_plane2.y().start() / _y_subsampling[2], win_sub_plane2.y().end() / _y_subsampling[2], 1));
+
+ unsigned int idx = 0;
+
+ // Set inputs
+ add_2D_tensor_argument(idx, _planes[0], slice);
+ add_2D_tensor_argument(idx, _planes[1], win_sub_plane1);
+ add_2D_tensor_argument(idx, _planes[2], win_sub_plane2);
+
+ if(nullptr != _planes[3])
+ {
+ add_2D_tensor_argument(idx, _planes[3], slice);
+ }
+
+ // Set outputs
+ if(nullptr != _output) // Single planar output
+ {
+ add_2D_tensor_argument(idx, _output, slice);
+ }
+ else // Multi-planar output
+ {
+            // Reduce the slice in case of sub-sampling to avoid out-of-bounds accesses
+ slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1));
+
+ add_2D_tensor_argument(idx, _output_multi->cl_plane(0), slice);
+ add_2D_tensor_argument(idx, _output_multi->cl_plane(1), win_sub_plane1);
+
+ if(3 == num_planes_from_format(_output_multi->info()->format()))
+ {
+ add_2D_tensor_argument(idx, _output_multi->cl_plane(2), win_sub_plane2);
+ }
+
+ _kernel.setArg(idx++, slice.y().end());
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
new file mode 100644
index 0000000000..541153316a
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLChannelExtractKernel::CLChannelExtractKernel()
+ : _input(nullptr), _output(nullptr), _num_elems_processed_per_iteration(8), _subsampling(1)
+{
+}
+
+void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+ ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+ _input = input;
+ _output = output;
+
+ // Check format
+ const Format format = input->info()->format();
+ ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
+
+ // Create kernel
+ std::string kernel_name = "channel_extract_" + string_from_format(format);
+ std::set<std::string> build_opts = { ("-DCHANNEL_" + string_from_channel(channel)) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Halve the number of processed elements for the U and V channels due to their sub-sampling of 2
+ _subsampling = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1;
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowRectangle output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ ValidRegion input_valid_region = input->info()->valid_region();
+ output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+ ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+ // Get format
+ const Format fmt = input->info()->format();
+
+ // Get input plane
+ const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(fmt, channel));
+ ARM_COMPUTE_ERROR_ON(nullptr == input_plane);
+
+ _output = output;
+ _input = input_plane;
+ _subsampling = 1;
+
+ // Create kernel
+ std::string kernel_name;
+ std::set<std::string> build_opts;
+ if(Channel::Y == channel || Format::IYUV == fmt || Format::YUV444 == fmt)
+ {
+ kernel_name = "copy_plane";
+ }
+ else
+ {
+ kernel_name = "channel_extract_" + string_from_format(fmt);
+ build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
+ }
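+ // For the Y channel, and for any channel of IYUV/YUV444, the requested channel occupies
+ // a whole plane, so a plain plane copy suffices; NV12/NV21 interleave U and V in plane 1
+ // and need an actual extraction kernel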
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure window
+ Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input_plane->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ Window win_sub(slice);
+ win_sub.set(Window::DimX, Window::Dimension(win_sub.x().start() / _subsampling, win_sub.x().end() / _subsampling, win_sub.x().step() / _subsampling));
+ win_sub.set(Window::DimY, Window::Dimension(win_sub.y().start() / _subsampling, win_sub.y().end() / _subsampling, 1));
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, win_sub);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
new file mode 100644
index 0000000000..ad66c39483
--- /dev/null
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+CLCol2ImKernel::CLCol2ImKernel()
+ : _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+ _convolved_dims = convolved_dims;
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
+
+ // Set static kernel arguments
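+ // The convolved output width (_convolved_dims.first) lets the kernel wrap the linear
+ // column index back into (x, y) coordinates of the convolved output plane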
+ unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_uint>(idx++, _convolved_dims.first);
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(win);
+}
+
+void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_2D();
+ Window slice_out = window.first_slice_window_3D();
+ do
+ {
+ // Set inputs
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_in);
+ }
+ while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
new file mode 100644
index 0000000000..ead2b8f092
--- /dev/null
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <sstream>
+
+using namespace arm_compute;
+
+CLColorConvertKernel::CLColorConvertKernel()
+ : _input(nullptr), _output(nullptr), _multi_input(nullptr), _multi_output(nullptr)
+{
+}
+
+void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+ switch(input->info()->format())
+ {
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 8;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::RGB888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _multi_input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ // Configure kernel window
+ const bool has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
+ const float sub_sampling = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
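+ // NV12/NV21/IYUV store chroma at half resolution in both dimensions (4:2:0), hence the
+ // 0.5 scale on the chroma plane accesses and the DimY step of 2 below (one chroma row per two luma rows)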
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ win.set_dimension_step(Window::DimY, 2);
+
+ AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling, sub_sampling);
+ AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling, sub_sampling);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ plane0_access, plane1_access, plane2_access,
+ output_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
+ input->plane(2)->info()->valid_region());
+ output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ bool has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
+ float sub_sampling = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+ switch(input->info()->format())
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ case Format::IYUV:
+ num_elems_processed_per_iteration = 2;
+ break;
+ case Format::YUV444:
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ case Format::IYUV:
+ num_elems_processed_per_iteration = 8;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _input = input;
+ _multi_output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
+ {
+ win.set_dimension_step(Window::DimY, 2);
+ }
+
+ AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+ AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0,
+ num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_plane0_access,
+ output_plane1_access,
+ output_plane2_access);
+
+ ValidRegion input_region = input->info()->valid_region();
+
+ output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output)
+{
+ unsigned int num_elems_processed_per_iteration = 0;
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ {
+ switch(output->info()->format())
+ {
+ case Format::IYUV:
+ case Format::YUV444:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::YUV444:
+ case Format::NV12:
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+ string_from_format(input->info()->format()).c_str(),
+ string_from_format(output->info()->format()).c_str());
+
+ std::stringstream kernel_name;
+
+ kernel_name << string_from_format(input->info()->format());
+ kernel_name << "_to_";
+ kernel_name << string_from_format(output->info()->format());
+ kernel_name << "_bt709";
+
+ _multi_input = input;
+ _multi_output = output;
+
+ // Create kernel
+ bool has_two_input_planars = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
+ bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
+
+ float sub_sampling_input = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
+ float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+ Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration));
+ win.set_dimension_step(Window::DimY, 2);
+
+ AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling_input, sub_sampling_input);
+ AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+ sub_sampling_input, sub_sampling_input);
+ AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
+ AccessWindowRectangle output_plane2_access(has_two_output_planars ? nullptr : output->plane(2)->info(), 0, 0,
+ num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
+
+ update_window_and_padding(win,
+ input_plane0_access, input_plane1_access, input_plane2_access,
+ output_plane0_access, output_plane1_access, output_plane2_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
+ input->plane(2)->info()->valid_region());
+ output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ if(nullptr != _input && nullptr != _output)
+ {
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else if(nullptr != _input && nullptr != _multi_output)
+ {
+ Format format = _multi_output->info()->format();
+ do
+ {
+ Window win_uv(slice);
+
+ if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
+ {
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ }
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
+ for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_uv);
+ }
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else if(nullptr != _multi_input && nullptr != _output)
+ {
+ Format format = _multi_input->info()->format();
+ do
+ {
+ Window win_uv(slice);
+
+ if((Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format))
+ {
+ win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
+ win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
+
+ for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_uv);
+ }
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else if(nullptr != _multi_input && nullptr != _multi_output)
+ {
+ Format in_format = _multi_input->info()->format();
+ Format out_format = _multi_output->info()->format();
+ do
+ {
+ Window win_in_uv(slice);
+ if((Format::NV12 == in_format) || (Format::NV21 == in_format) || (Format::IYUV == in_format))
+ {
+ win_in_uv.set(Window::DimX, Window::Dimension(win_in_uv.x().start() / 2,
+ win_in_uv.x().end() / 2, win_in_uv.x().step() / 2));
+ win_in_uv.set(Window::DimY, Window::Dimension(win_in_uv.y().start() / 2, win_in_uv.y().end() / 2, 1));
+ }
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(0), slice);
+ for(int i = 1; i < 3 && (0 != _multi_input->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_input->cl_plane(i), win_in_uv);
+ }
+
+ Window win_out_uv(slice);
+ if((Format::NV12 == out_format) || (Format::NV21 == out_format) || (Format::IYUV == out_format))
+ {
+ win_out_uv.set(Window::DimX, Window::Dimension(win_out_uv.x().start() / 2,
+ win_out_uv.x().end() / 2, win_out_uv.x().step() / 2));
+ win_out_uv.set(Window::DimY, Window::Dimension(win_out_uv.y().start() / 2, win_out_uv.y().end() / 2, 1));
+ }
+
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(0), slice);
+ for(int i = 1; i < 3 && (0 != _multi_output->cl_plane(i)->info()->num_dimensions()); ++i)
+ {
+ add_2D_tensor_argument(idx, _multi_output->cl_plane(i), win_out_uv);
+ }
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
new file mode 100644
index 0000000000..bdfe398a1d
--- /dev/null
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+#define MAX_MATRIX_SIZE 81
+
+/****************************************************************************************\
+ * Square Convolution *
+\****************************************************************************************/
+
+template <unsigned int matrix_size>
+BorderSize CLConvolutionKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2);
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(conv == nullptr);
+
+ _input = input;
+ _output = output;
+
+ std::stringstream kernel_name;
+ std::set<std::string> options;
+ kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static";
+
+ if(scale == 0)
+ {
+ scale = calculate_matrix_scale(conv, matrix_size);
+ }
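+ // A scale of 0 requests the default scale, derived from the sum of the matrix
+ // coefficients (and never less than 1)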
+
+ for(unsigned int i = 0; i < matrix_size * matrix_size; i++)
+ {
+ std::stringstream mat_str;
+ mat_str << "-DMAT" << i << "=" << conv[i];
+ options.insert(mat_str.str());
+ }
+
+ options.insert("-DSCALE=" + val_to_string(scale));
+
+ DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
+ options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+
+ std::stringstream out_type;
+ out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+ options.insert(out_type.str());
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = matrix_size;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ * Separable Convolution *
+\****************************************************************************************/
+template <unsigned int matrix_size>
+CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel()
+ : _border_size(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize CLSeparableConvolutionHorKernel<matrix_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <unsigned int matrix_size>
+void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
+
+ ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ int16_t mat[matrix_size * matrix_size] = { 0 };
+ memcpy(mat, conv, matrix_size * sizeof(int16_t));
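+ // Only the first matrix_size coefficients are copied; the rest of mat stays zero,
+ // so the generated MAT defines describe a 1xN row kernel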
+
+ for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+ {
+ build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ }
+
+ build_opts.insert("-DSCALE=0");
+
+ build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+template <unsigned int matrix_size>
+BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2, 0);
+}
+
+template <unsigned int matrix_size>
+void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output,
+ const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+ ARM_COMPUTE_ERROR_ON(scale == 0);
+
+ _input = input;
+ _output = output;
+
+ std::set<std::string> build_opts;
+
+ int16_t mat[matrix_size * matrix_size] = { 0 };
+ memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t));
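+ // The coefficients are placed in the second row of the flattened matrix (offset
+ // matrix_size), matching the MAT defines the Nx1 vertical kernels read from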
+
+ for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+ {
+ build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ }
+
+ build_opts.insert("-DSCALE=" + val_to_string(scale));
+
+ build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+
+ build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type));
+
+ std::stringstream out_type;
+ out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+ build_opts.insert(out_type.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = matrix_size;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ * Rectangle Convolution *
+\****************************************************************************************/
+
+CLConvolutionRectangleKernel::CLConvolutionRectangleKernel()
+ : _border_size(0), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize CLConvolutionRectangleKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(nullptr == conv);
+ ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
+ ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
+ ARM_COMPUTE_ERROR_ON(0 == scale);
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(height / 2, width / 2);
+
+ std::set<std::string> options;
+
+ std::stringstream output_type;
+ output_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+ options.insert(output_type.str());
+
+ uint32_t matrix_size = width * height;
+
+ int16_t mat[MAX_MATRIX_SIZE] = { 0 };
+
+ memcpy(mat, conv, matrix_size * sizeof(int16_t));
+
+ for(unsigned int j = 0; j < MAX_MATRIX_SIZE; j++)
+ {
+ options.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+ }
+
+ options.insert("-DSCALE=" + val_to_string(scale));
+
+ DataType data_type = data_type_for_convolution_matrix(conv, matrix_size);
+ options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+
+ options.insert("-DMATRIX_WIDTH=" + val_to_string(width));
+ options.insert("-DMATRIX_HEIGHT=" + val_to_string(height));
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", options));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ const unsigned int num_rows_read_per_iteration = height;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+template class arm_compute::CLConvolutionKernel<3>;
+template class arm_compute::CLConvolutionKernel<5>;
+template class arm_compute::CLConvolutionKernel<7>;
+template class arm_compute::CLConvolutionKernel<9>;
+template class arm_compute::CLSeparableConvolutionVertKernel<5>;
+template class arm_compute::CLSeparableConvolutionVertKernel<7>;
+template class arm_compute::CLSeparableConvolutionVertKernel<9>;
+template class arm_compute::CLSeparableConvolutionHorKernel<5>;
+template class arm_compute::CLSeparableConvolutionHorKernel<7>;
+template class arm_compute::CLSeparableConvolutionHorKernel<9>;
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..73f1ba15df
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenateKernel::CLDepthConcatenateKernel()
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+{
+}
+
+BorderSize CLDepthConcatenateKernel::border_size() const
+{
+ return BorderSize(_top_bottom, _left_right);
+}
+
+void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+ // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+ // Otherwise it is not clear how the padding should be added onto the input tensor
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+
+ // Configure kernel window
+ _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+ _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+ const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom *
+ output->info()->strides_in_bytes()[1];
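+ // e.g. concatenating a 24x24xC input into a 28x28xD output gives _left_right == _top_bottom == 2,
+ // and the offset points at element (2, 2, depth_offset) of the output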
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = 1;
+
+ // The window needs to be based on input as we copy all the depths of input
+ Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes);
+
+ ICLKernel::configure(win);
+}
+
+void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp
new file mode 100644
index 0000000000..24608bd17c
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
+ ARM_COMPUTE_ERROR_ON(shift >= 8);
+
+ // Check if the conversion is supported
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
+ && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
+ "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+ && output->info()->data_type() != DataType::S32),
+ "Only data types supported [in] U16 -> [out] U8, U32, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+ && output->info()->data_type() != DataType::S32),
+ "Only data types supported [in] S16 -> [out] U8, U32, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+ && output->info()->data_type() != DataType::S16),
+ "Only data types supported [in] U32 -> [out] U8, U16, S16");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+ && output->info()->data_type() != DataType::S16),
+ "Only data types supported [in] S32 -> [out] U8, U16, S16");
+
+ // Get data sizes
+ const size_t input_size = data_size_from_type(input->info()->data_type());
+ const size_t output_size = data_size_from_type(output->info()->data_type());
+
+ // Construct kernel name and build options
+ std::string kernel_name = "convert_depth";
+ std::set<std::string> build_opts;
+ if(input_size > output_size)
+ {
+ kernel_name += "_down";
+ build_opts.insert((policy == ConvertPolicy::WRAP) ? "-DWRAP" : "-DSATURATE");
+ }
+ else
+ {
+ kernel_name += "_up";
+ }
+ build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set shift arg
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg(idx++, shift);
+
+ // Configure kernel
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
new file mode 100644
index 0000000000..36ba06d528
--- /dev/null
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLDerivativeKernel::CLDerivativeKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false)
+{
+}
+
+BorderSize CLDerivativeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_derivative_x = output_x != nullptr;
+ _run_derivative_y = output_y != nullptr;
+
+ if(_run_derivative_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_derivative_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_derivative_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_derivative_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("derivative", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_read_rows_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), 0, 0, 0, 0);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
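+ // Request only the neighbourhood each selected gradient actually reads: one extra column
+ // on each side for d/dx, one extra row above and below for d/dy, and the full
+ // 3-row rectangle when both are computed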
+ if(_run_derivative_x && _run_derivative_y)
+ {
+ input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
+ }
+ else if(_run_derivative_x)
+ {
+ input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration);
+ }
+ else if(_run_derivative_y)
+ {
+ input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
+ }
+
+ update_window_and_padding(win,
+ input_access,
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_derivative_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_derivative_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
new file mode 100644
index 0000000000..3abd747011
--- /dev/null
+++ b/src/core/CL/kernels/CLDilateKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLDilateKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dilate"));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
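+
+ // A sketch of the read geometry: with a 3x3 structuring element and a border
+ // of 1, producing 8 output pixels needs 8 + 2 = 10 input pixels per row over
+ // 3 rows; the per-row read is rounded up to 16 elements, a vector-friendly
+ // width.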
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
new file mode 100644
index 0000000000..a7aa88fc5c
--- /dev/null
+++ b/src/core/CL/kernels/CLErodeKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLErodeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("erode"));
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
new file mode 100644
index 0000000000..1d4d776730
--- /dev/null
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLFastCornersKernel::CLFastCornersKernel()
+ : ICLKernel(), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize CLFastCornersKernel::border_size() const
+{
+ return BorderSize(3);
+}
+
+void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented");
+
+ _input = input;
+ _output = output;
+
+ // Create build options
+ std::set<std::string> build_opts;
+
+ if(non_max_suppression)
+ {
+ build_opts.emplace("-DUSE_MAXSUPPRESSION");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fast_corners", build_opts));
+
+ // Set static kernel arguments
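+ // Each call to add_2D_tensor_argument() in run() consumes
+ // num_arguments_per_2D_tensor() argument slots (roughly the buffer, the
+ // per-dimension strides/steps and the offset of the first element), so the
+ // first static argument starts after two such blocks.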
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_float>(idx, static_cast<float>(threshold));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 7;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_mode == BorderMode::UNDEFINED, BorderSize(3));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_mode == BorderMode::UNDEFINED, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLCopyToArrayKernel::CLCopyToArrayKernel()
+ : ICLKernel(), _input(nullptr), _corners(nullptr), _num_buffer(nullptr)
+{
+}
+
+void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(corners == nullptr);
+ ARM_COMPUTE_ERROR_ON(num_buffers == nullptr);
+
+ _input = input;
+ _corners = corners;
+ _num_buffer = num_buffers;
+
+ std::set<std::string> build_opts;
+
+ if(update_number)
+ {
+ build_opts.emplace("-DUPDATE_NUMBER");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_to_keypoint", build_opts));
+
+ // Get how many pixels were skipped in the x dimension by the previous stages
+ unsigned int offset = _input->info()->valid_region().anchor.x();
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameter
+ _kernel.setArg<unsigned int>(idx++, corners->max_num_values());
+ _kernel.setArg<cl_uint>(idx++, offset);
+ _kernel.setArg(idx++, *_num_buffer);
+ _kernel.setArg(idx++, _corners->cl_buffer());
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ ICLKernel::configure(win);
+}
+
+void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Initialise _num_buffer as it is used as both input and output
+ static const unsigned int zero_init = 0;
+ queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init);
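+ // The write is non-blocking (CL_FALSE); assuming the usual in-order command
+ // queue this is safe, as the kernel enqueued below cannot start before the
+ // buffer has been zeroed.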
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
new file mode 100644
index 0000000000..981aad665a
--- /dev/null
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLFillBorderKernel::CLFillBorderKernel()
+ : ICLKernel(), _tensor(nullptr)
+{
+}
+
+bool CLFillBorderKernel::is_parallelisable() const
+{
+ return false;
+}
+
+template <class T>
+void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
+{
+ T value;
+ constant_border_value.get(value);
+ ICLKernel::add_argument<T>(idx, static_cast<T>(value));
+}
+
+void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+ ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+
+ border_size.limit(tensor->info()->padding());
+
+ // If there is no border: early exit
+ if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+ {
+ return;
+ }
+
+ // Select appropriate kernel
+ std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
+
+ // Define the select type required when the replicate border is larger than 1
+ const DataType dt = tensor->info()->data_type();
+ std::string select_type = get_cl_type_from_data_type(dt);
+ if(is_data_type_float(dt))
+ {
+ select_type = (DataType::F32 == dt) ? "int" : "short";
+ }
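+ // OpenCL's select() requires a mask type whose element width matches the
+ // data, so floating point types are substituted with integer types of the
+ // same size (int for F32, short for F16).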
+
+ // Define build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+ build_opts.emplace(("-DSELECT_TYPE=" + select_type));
+ build_opts.emplace(("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top)));
+ build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom)));
+ build_opts.emplace(("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left)));
+ build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _tensor = tensor;
+
+ // Create static kernel arguments
+ const unsigned int valid_width = tensor->info()->valid_region().shape[0];
+ const unsigned int valid_height = tensor->info()->valid_region().shape[1];
+ const cl_int2 valid_region_coords =
+ {
+ {
+ static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
+ static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
+ }
+ };
+ const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the tensor parameters
+ ICLKernel::add_argument<cl_uint>(idx, valid_width);
+ ICLKernel::add_argument<cl_uint>(idx, valid_height);
+ ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
+ if(BorderMode::CONSTANT == border_mode)
+ {
+ switch(dt)
+ {
+ case DataType::U8:
+ set_constant_border<uint8_t>(idx, constant_border_value);
+ break;
+ case DataType::U16:
+ set_constant_border<uint16_t>(idx, constant_border_value);
+ break;
+ case DataType::S16:
+ set_constant_border<int16_t>(idx, constant_border_value);
+ break;
+ case DataType::U32:
+ set_constant_border<uint32_t>(idx, constant_border_value);
+ break;
+ case DataType::S32:
+ set_constant_border<int32_t>(idx, constant_border_value);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ set_constant_border<float>(idx, constant_border_value);
+ break;
+ case DataType::F16:
+ static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
+ set_constant_border<cl_half>(idx, constant_border_value);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ }
+ }
+
+ // Configure kernel window
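+ // A 1D launch in X is used: roughly one work-item per column of the top and
+ // bottom borders (total_valid_width) plus one per row of the left and right
+ // borders (valid_height).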
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win.use_tensor_dimensions(tensor->info(), Window::DimZ);
+ ICLKernel::configure(win);
+}
+
+void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ // Border mode undefined or border width == 0
+ if(_kernel() == nullptr)
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _tensor, slice);
+ enqueue(queue, *this, slice, cl::NullRange);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000000..71d42c5606
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(static_cast<float>(input->info()->dimension(1)) / 4.0f));
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ const std::string data_type_name = val_to_string(input->info()->element_size() * 8) + "bit";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ /*
+ * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * After this operation, the output matrix will have the following shape: [ width * 4, ceil(height / 4) ]
+ */
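+ // In coordinates, input element (x, y) maps to output (4 * x + y % 4, y / 4).
+ // A host-side reference for the transform (an illustrative sketch assuming
+ // height is a multiple of 4, not part of the kernel):
+ //
+ //   for(size_t y = 0; y < height; ++y)
+ //       for(size_t x = 0; x < width; ++x)
+ //           out[(y / 4) * (4 * width) + 4 * x + y % 4] = in[y * width + x];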
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = window.first_slice_window_2D();
+
+ // Change x and y steps for the slide of output tensor
+ out_slice.scale(Window::DimX, 4.f);
+ out_slice.scale(Window::DimY, 0.25f);
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..c6e05b92a2
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ // Create kernel and set static arguments
+ std::set<std::string> build_opts = { ("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0))) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts));
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
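+ // The scalar arguments implement the usual low precision GEMM
+ // requantization; inferred from the argument names, the intended arithmetic
+ // is roughly:
+ //   acc = sum_k (a[x][k] + a_offset) * (b[k][y] + b_offset)
+ //   out = ((acc + output_offset) * output_mult_int) >> shift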
+ _kernel.setArg<int32_t>(idx++, a_offset);
+ _kernel.setArg<int32_t>(idx++, b_offset);
+ _kernel.setArg<int32_t>(idx++, output_offset);
+ _kernel.setArg<int32_t>(idx++, output_mult_int);
+ _kernel.setArg<int32_t>(idx++, shift);
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ Window slice_matrix_b = slice;
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+ slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000000..289873c23f
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+
+ _biases = biases;
+ _accum = accum;
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type());
+
+ Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+ AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, biases_access, accum_access);
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window accum_slice = window.first_slice_window_2D();
+
+ Window biases_slice(accum_slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
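+ // The biases are a 1D vector: pinning DimY to a single iteration adds the
+ // same bias row to every row of the accumulator slice.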
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _accum, accum_slice);
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+
+ enqueue(queue, *this, accum_slice);
+ }
+ while(window.slide_window_slice_2D(accum_slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000000..343838f2f9
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixAdditionKernel::CLGEMMMatrixAdditionKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ _input = input;
+ _output = output;
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+
+ std::ostringstream ma_arguments;
+ ma_arguments << "-DBETA=" << beta;
+ std::set<std::string> build_opts;
+ build_opts.emplace(ma_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..d7388e8579
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ if(output->info()->dimension(1) == 1)
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ }
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
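+ // Heuristic local work-group size hint: an output height of 196 (e.g. 14x14
+ // feature maps flattened for convolution) gets a narrow 1x7 work-group,
+ // everything else a square 8x8 one.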
+ if(output->info()->dimension(1) == 196)
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
+
+ std::ostringstream mm_arguments;
+ mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " ";
+ mm_arguments << "-DALPHA=" << alpha << " ";
+ std::set<std::string> build_opts;
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if(output->info()->dimension(1) == 1)
+ {
+ mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+ else
+ {
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+
+ if(data_type_name == "f32")
+ {
+ GPUTarget arch_target = get_arch_from_target(get_target());
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+ }
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+ AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+ }
+}
+
+void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ Window slice_matrix_b = slice;
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+ slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000000..ecee1abd72
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t transpose_w = 16 / input->info()->element_size();
+ output_shape.set(0, input->info()->dimension(1) * transpose_w);
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+ // Auto-initialize the output tensor if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+
+ /*
+ * The following example shows how the transpose1xW operation works when the input data type is F32:
+ *
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
+ * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ */
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+ std::string kernel_name = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ float scale_x = 1.f;
+
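+ // scale_x mirrors transpose_w = 16 / element_size(): 16 U8, 8 F16 or 4 F32
+ // values fit in one 16-byte transposition block.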
+ switch(input->info()->data_type())
+ {
+ case DataType::U8:
+ scale_x = 16.f;
+ break;
+ case DataType::F16:
+ scale_x = 8.f;
+ break;
+ case DataType::F32:
+ scale_x = 4.f;
+ break;
+ default:
+ // Do nothing
+ break;
+ }
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Output is transposed
+ Window out_window(window);
+ out_window.set(Window::DimX, window.y());
+ out_window.set(Window::DimY, window.x());
+
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
new file mode 100644
index 0000000000..e5bc3f9656
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLGaussian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Set build options
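+ // Coefficients of the statically compiled 3x3 Gaussian filter:
+ //   1 2 1
+ //   2 4 2   with SCALE = 16, the sum of the coefficients.
+ //   1 2 1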
+ std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1",
+ "-DMAT3=2", "-DMAT4=4", "-DMAT5=2",
+ "-DMAT6=1", "-DMAT7=2", "-DMAT8=1",
+ "-DSCALE=16", "-DDATA_TYPE_OUT=uchar"
+ };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
new file mode 100644
index 0000000000..bd523c883d
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ const int16_t matrix[] = { 1, 4, 6, 4, 1 };
+
+ // Set arguments
+ CLSeparableConvolution5x5HorKernel::configure(input, output, matrix, border_undefined);
+}
+
+void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
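+ // The 5x5 Gaussian is separable into [ 1 4 6 4 1 ] applied horizontally and
+ // vertically; the combined normalization of (1 + 4 + 6 + 4 + 1)^2 = 256 is
+ // applied once, in this vertical pass.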
+ const uint32_t scale = 256;
+ const int16_t matrix[] = { 1, 4, 6, 4, 1 };
+
+ // Set arguments
+ CLSeparableConvolution5x5VertKernel::configure(input, output, matrix, scale, border_undefined);
+}
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
new file mode 100644
index 0000000000..34a228c717
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
+ : _border_size(0), _l2_load_offset(0)
+{
+}
+
+BorderSize CLGaussianPyramidHorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 20;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr float scale_x = 0.5f;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
+
+ // Sub-sampling selects odd pixels (1, 3, 5, ...) for images with even
+ // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
+ // a pixel is even or odd is determined based on the tensor shape, not the
+ // valid region!)
+ // Thus the offset from which the first pixel (L2) for the convolution is
+ // loaded depends on the anchor and shape of the valid region.
+ // In the case of an even shape (= even image width) we need to load L2
+ // from -2 if the anchor is odd and from -1 if the anchor is even. That
+ // makes sure that L2 is always loaded from an odd pixel.
+ // On the other hand, for an odd shape (= odd image width) we need to load
+ // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
+ // achieve the opposite effect.
+ // The condition can be simplified to checking whether anchor + shape is
+ // odd (-2) or even (-1) as only adding an odd and an even number will have
+ // an odd result.
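+ // For example, with anchor 0 and width 640 (even sum) L2 is loaded from
+ // offset -1, whereas anchor 0 and width 639 (odd sum) gives offset -2.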
+ _l2_load_offset = -border_size().left;
+
+ if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
+ {
+ _l2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
+ valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]);
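+ // Illustration (assuming a 640 pixel wide valid region anchored at 0 and
+ // border_undefined = true, i.e. a left/right border of 2): the anchor
+ // becomes ceil(2 / 2.f) = 1 and the shape (640 - 2) / 2 - 1 = 318.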
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, _l2_load_offset);
+
+ // The output is half the width of the input:
+ Window win_out(window);
+ win_out.scale(Window::DimX, 0.5f);
+
+ Window slice_in = win_in.first_slice_window_2D();
+ Window slice_out = win_out.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
+}
+
+CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel()
+ : _t2_load_offset(0)
+{
+}
+
+BorderSize CLGaussianPyramidVertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian5x1_sub_y"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_rows_processed_per_iteration = 2;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_per_iteration = 5;
+ constexpr float scale_y = 0.5f;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration),
+ border_undefined, border_size());
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
+
+ // Determine whether we need to load even or odd rows. See above for a
+ // detailed explanation.
+ _t2_load_offset = -border_size().top;
+
+ if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
+ {
+ _t2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
+ valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]);
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(window.x().step() != 8);
+ ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
+
+ Window win_in(window);
+ win_in.shift(Window::DimY, _t2_load_offset);
+
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.5f);
+
+ Window slice_in = win_in.first_slice_window_2D();
+ Window slice_out = win_out.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
new file mode 100644
index 0000000000..87659c4ba9
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
+ : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
+{
+}
+
+void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+ _input_magnitude = input_magnitude;
+ _input_phase = input_phase;
+ _output = output;
+ _cell_size = hog_info->cell_size();
+
+ float phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? hog_info->num_bins() / 360.0f : hog_info->num_bins() / 180.0f);
+ phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
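+ // For SIGNED phase the two factors above collapse to num_bins / 255.0f, as
+ // the U8 phase maps [0, 255] onto [0, 360) degrees; for UNSIGNED they
+ // collapse to num_bins / 180.0f.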
+
+ std::stringstream args_str;
+ args_str << "-DCELL_WIDTH=" << hog_info->cell_size().width << " ";
+ args_str << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " ";
+ args_str << "-DNUM_BINS=" << hog_info->num_bins() << " ";
+ args_str << "-DPHASE_SCALE=" << phase_scale << " ";
+
+ // Construct build options
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = hog_info->cell_size().height;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ // Compute slice for the magnitude and phase tensors
+ Window slice_mag_phase = window.first_slice_window_2D();
+ slice_mag_phase.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
+ slice_mag_phase.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input_magnitude, slice_mag_phase);
+ add_2D_tensor_argument(idx, _input_phase, slice_mag_phase);
+ add_2D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
+ : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
+{
+}
+
+void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+ // Number of cells per block
+ const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+ hog_info->block_size().height / hog_info->cell_size().height);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * num_cells_per_block.area(), DataType::F32);
+
+ // Number of cells per block stride
+ const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+ hog_info->block_stride().height / hog_info->cell_size().height);
+
+ _input = input;
+ _output = output;
+ _num_cells_per_block_stride = num_cells_per_block_stride;
+
+ std::stringstream args_str;
+ args_str << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " ";
+ args_str << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << num_cells_per_block.height << " ";
+ args_str << "-DNUM_BINS_PER_BLOCK_X=" << num_cells_per_block.width *hog_info->num_bins() << " ";
+ args_str << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " ";
+ args_str << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " ";
+ args_str << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " ";
+ args_str << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " ";
+ args_str << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
+
+ // Construct build options
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = num_cells_per_block.height;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+ const unsigned int num_rows_written_per_iteration = num_cells_per_block.height;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ // Compute slice for the input tensor
+ Window slice_in = window.first_slice_window_2D();
+ slice_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+ slice_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
new file mode 100644
index 0000000000..0f9a98950d
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLHOGDetectorKernel::CLHOGDetectorKernel()
+ : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
+{
+}
+
+void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
+ float threshold, uint16_t idx_class)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(hog == nullptr);
+ ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+ const Size2D &detection_window_size = hog->info()->detection_window_size();
+ const Size2D &block_size = hog->info()->block_size();
+ const Size2D &block_stride = hog->info()->block_stride();
+
+ _input = input;
+ _detection_windows = detection_windows;
+ _num_detection_windows = num_detection_windows;
+
+ const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+ const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+
+ ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
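+ // Illustrative sanity check with the classic HOG setup: a 64x128 detection
+ // window, 16x16 blocks, 8x8 block stride and 36 channels per block
+ // (9 bins x 2x2 cells) gives ((64 - 16) / 8 + 1) * 36 = 252 bins per row
+ // and (128 - 16) / 8 + 1 = 15 block rows, i.e. a descriptor of
+ // 252 * 15 + 1 = 3781 values including the bias term.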
+
+ std::stringstream args_str;
+ args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
+ args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
+ args_str << "-DTHRESHOLD=" << threshold << " ";
+ args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
+ args_str << "-DIDX_CLASS=" << idx_class << " ";
+ args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
+ args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
+ args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
+ args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+
+ // Construct build options
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+ _kernel.setArg(idx++, hog->cl_buffer());
+ _kernel.setArg(idx++, detection_windows->cl_buffer());
+ _kernel.setArg(idx++, *_num_detection_windows);
+
+ // Get the number of blocks along the x and y directions of the input tensor
+ const ValidRegion &valid_region = input->info()->valid_region();
+ const size_t num_blocks_x = valid_region.shape[0];
+ const size_t num_blocks_y = valid_region.shape[1];
+
+ // Get the number of blocks along the x and y directions of the detection window
+ const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+ const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+ const size_t window_step_x = detection_window_stride.width / block_stride.width;
+ const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
+ // Configure kernel window
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
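+ // The window thus enumerates the top-left block coordinate of each candidate
+ // detection window; rounding the end down to a multiple of the step keeps the
+ // iteration aligned with the detection window stride.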
+
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y;
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+ ICLKernel::configure(win);
+}
+
+void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
new file mode 100644
index 0000000000..9fc34a7760
--- /dev/null
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHarrisScoreKernel::CLHarrisScoreKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0)
+{
+}
+
+BorderSize CLHarrisScoreKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+ int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+ ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _sensitivity = sensitivity;
+ _strength_thresh = strength_thresh;
+ _norm_factor = norm_factor;
+ _border_size = BorderSize(block_size / 2);
+
+ // Select kernel
+ std::stringstream harris_score_kernel_name;
+ harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size;
+
+ // Create build options
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(harris_score_kernel_name.str(), build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg(idx++, sensitivity);
+ _kernel.setArg(idx++, strength_thresh);
+ _kernel.setArg(idx++, norm_factor);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+ constexpr unsigned int num_elems_written_per_iteration = 4;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+ output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
new file mode 100644
index 0000000000..87ee5fb74e
--- /dev/null
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstring>
+#include <string>
+
+using namespace arm_compute;
+
+// Each thread handles 16 pixels
+constexpr signed int pixels_per_item = 16;
+
+// local work group size in X dimension
+constexpr unsigned int local_x_size = 16;
+
+CLHistogramKernel::CLHistogramKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ // Check input size
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ // Check offset
+ ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is outside the image value range.");
+
+ // Check range
+ ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range is larger than the image value range.");
+
+ _input = input;
+ _output = output;
+
+ if(_input->info()->dimension(0) < pixels_per_item)
+ {
+ return;
+ }
+
+ unsigned int num_bins = _output->num_bins();
+ unsigned int window_size = _output->window();
+ unsigned int offset = _output->offset();
+ unsigned int range = _output->range();
+ unsigned int offrange = offset + range;
+ unsigned int bin_size = _output->size();
+ unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions
+
+ // Create kernel
+ bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
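+ // The fixed variant covers the canonical full-range histogram (256 bins of
+ // width 1, zero offset), so the bin-mapping arguments set below for the
+ // generic case are not needed.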
+ std::string kernel_name = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+ _kernel.setArg(idx++, buffer_size, nullptr);
+ _kernel.setArg(idx++, _output->cl_buffer());
+ if(!is_fixed_size)
+ {
+ _kernel.setArg<cl_uint>(idx++, num_bins);
+ _kernel.setArg<cl_uint>(idx++, offset);
+ _kernel.setArg<cl_uint>(idx++, range);
+ _kernel.setArg<cl_uint>(idx++, offrange);
+ }
+
+ // We only run the histogram on images, therefore only 2 dimensions are used here
+ unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, end_position, pixels_per_item));
+ win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item));
+
+ ICLKernel::configure(win);
+}
+
+void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ if(_input->info()->dimension(0) < pixels_per_item)
+ {
+ return;
+ }
+
+ _output->map(queue, true);
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+ memset(_output->buffer(), 0, _output->size());
+ _output->unmap(queue);
+
+ Window slice = window.first_slice_window_2D();
+ cl::NDRange lws = cl::NDRange(local_x_size, 1);
+
+ do
+ {
+ /* Run the core part, whose width is divisible by 16 */
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ enqueue(queue, *this, slice, lws);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLHistogramBorderKernel::CLHistogramBorderKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ // Check input size
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ // Check offset
+ ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is outside the image value range.");
+
+ // Check range
+ ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range is larger than the image value range.");
+
+ // We only run the histogram on images, therefore only 2 dimensions are used here
+ unsigned int start_position = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
+
+ if(start_position >= input->info()->dimension(0))
+ {
+ return; // no need to run histogram border kernel
+ }
+
+ _input = input;
+ _output = output;
+
+ unsigned int num_bins = _output->num_bins();
+ unsigned int window_size = _output->window();
+ unsigned int offset = _output->offset();
+ unsigned int range = _output->range();
+ unsigned int offrange = offset + range;
+
+ // Create kernel
+ bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
+ std::string kernel_name = is_fixed_size ? "hist_border_kernel_fixed" : "hist_border_kernel";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+ _kernel.setArg(idx++, _output->cl_buffer());
+ if(!is_fixed_size)
+ {
+ _kernel.setArg<cl_uint>(idx++, num_bins);
+ _kernel.setArg<cl_uint>(idx++, offset);
+ _kernel.setArg<cl_uint>(idx++, range);
+ _kernel.setArg<cl_uint>(idx++, offrange);
+ }
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(start_position, _input->info()->dimension(0)));
+ win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1));
+ ICLKernel::configure(win);
+}
+
+void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ if(window.x().start() >= window.x().end())
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ cl::NDRange lws = cl::NDRange(1, 1);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ /* Run the border part, whose width is not divisible by 16 */
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ enqueue(queue, *this, slice, lws);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
new file mode 100644
index 0000000000..8c0fe26666
--- /dev/null
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLIm2ColKernel::CLIm2ColKernel()
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+{
+}
+
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace((has_bias ? "-DHAS_BIAS" : ""));
+
+ int pad_x = 0;
+ int pad_y = 0;
+ int stride_x = 0;
+ int stride_y = 0;
+ std::tie(pad_x, pad_y) = conv_info.pad();
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
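+ // The "reduced" path applies when im2col degenerates into flattening each
+ // input volume into a single row (output width == w * h * d, unit strides,
+ // no padding), e.g. in front of a fully connected layer.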
+ const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+ && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1))
+ && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+
+ if(!run_img2col_reduced)
+ {
+ _convolved_dims = convolved_dims;
+ _conv_info = conv_info;
+ _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
+ _num_elems_processed_per_iteration = output->info()->dimension(0);
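+ // The patch size can be recovered because each output row holds
+ // kernel_w * kernel_h * depth values (plus one for the bias): e.g. an
+ // output width of 4609 with depth 512 and a bias gives
+ // sqrt((4609 - 1) / 512) = 3, i.e. a 3x3 kernel.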
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+
+ // Create static kernel arguments
+ const cl_int2 input_dims =
+ {
+ {
+ static_cast<cl_int>(input->info()->dimension(0)),
+ static_cast<cl_int>(input->info()->dimension(1)),
+ }
+ };
+ const cl_int2 strides =
+ {
+ {
+ stride_x,
+ stride_y,
+ }
+ };
+ const cl_int2 paddings =
+ {
+ {
+ pad_x,
+ pad_y,
+ }
+ };
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_int>(idx++, _kernel_size);
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */);
+ _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */);
+ _kernel.setArg<cl_int2>(idx++, input_dims);
+ _kernel.setArg<cl_int2>(idx++, strides);
+ _kernel.setArg<cl_int2>(idx++, paddings);
+
+ _run_func = &CLIm2ColKernel::run_generic;
+ }
+ else
+ {
+ _num_elems_processed_per_iteration = 1;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
+ _run_func = &CLIm2ColKernel::run_reduced;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(win);
+}
+
+void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
+ (this->*_run_func)(window, queue);
+}
+
+void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ int pad_x = 0;
+ int pad_y = 0;
+ int stride_x = 0;
+ int stride_y = 0;
+ std::tie(pad_x, pad_y) = _conv_info.pad();
+ std::tie(stride_x, stride_y) = _conv_info.stride();
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+ Window slice_in = window.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
+
+ // Setup slice
+ slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
+ slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+ slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Setup input slice
+ // The first three dimensions of the input are increased by the inner loops
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Setup output slice
+ slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
+ slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in));
+}
+
+void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window out_slice = out_window.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_3D();
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
new file mode 100644
index 0000000000..69ede457df
--- /dev/null
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal"));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+ const unsigned int num_elems_accessed_per_iteration = ceil_to_multiple(num_elems_processed_per_iteration, 16);
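+ // e.g. a 100 pixel wide row is processed in a single iteration, but access
+ // is padded to ceil_to_multiple(100, 16) = 112 elements.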
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_accessed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_accessed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+CLIntegralImageVertKernel::CLIntegralImageVertKernel()
+ : _in_out(nullptr)
+{
+}
+
+void CLIntegralImageVertKernel::configure(ICLTensor *in_out)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32);
+
+ _in_out = in_out;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_vertical"));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8;
+ const unsigned int num_elems_processed_per_iteration_y = in_out->info()->dimension(Window::DimY);
+
+ Window win = calculate_max_window(*in_out->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle in_out_access(in_out->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, in_out_access);
+
+ in_out_access.set_valid_region(win, in_out->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const size_t height = _in_out->info()->dimension(1);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _in_out, slice);
+ _kernel.setArg<cl_uint>(idx++, height);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
new file mode 100644
index 0000000000..12cdd0ec93
--- /dev/null
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
+ ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+ bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
+
+{
+ ARM_COMPUTE_ERROR_ON(old_points == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+
+ const float scale = std::pow(pyramid_scale, level);
+
+ // Create kernel
+ std::string kernel_name = "init_level";
+ if(level == (num_levels - 1))
+ {
+ kernel_name += (use_initial_estimate) ? std::string("_max_initial_estimate") : std::string("_max");
+ }
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set static kernel arguments
+ unsigned int idx = 0;
+ if(level == (num_levels - 1))
+ {
+ _kernel.setArg(idx++, old_points->cl_buffer());
+ if(use_initial_estimate)
+ {
+ _kernel.setArg(idx++, new_points_estimates->cl_buffer());
+ }
+ }
+ _kernel.setArg(idx++, old_points_internal->cl_buffer());
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg<cl_float>(idx++, scale);
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ ICLKernel::configure(window);
+}
+
+void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ enqueue(queue, *this, window);
+}
+
+void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
+
+{
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(new_points == nullptr);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("finalize"));
+
+ // Set static kernel arguments
+ unsigned int idx = 0;
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, new_points->cl_buffer());
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+ ICLKernel::configure(window);
+}
+
+void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ enqueue(queue, *this, window);
+}
+
+CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel()
+ : _old_input(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr)
+{
+}
+
+void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
+ ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+ ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ size_t window_dimension, size_t level)
+
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+ _old_input = old_input;
+ _old_scharr_gx = old_scharr_gx;
+ _old_scharr_gy = old_scharr_gy;
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ const ValidRegion valid_region = intersect_valid_regions(
+ old_input->info()->valid_region(),
+ old_scharr_gx->info()->valid_region(),
+ old_scharr_gy->info()->valid_region());
+
+ update_window_and_padding(window,
+ AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)));
+
+ ICLKernel::configure(window);
+
+ // Initialize required variables
+ const int level0 = (level == 0) ? 1 : 0;
+ const int window_size = window_dimension;
+ const int window_size_squared = window_dimension * window_dimension;
+ const int window_size_half = window_dimension / 2;
+ const float eig_const = 1.0f / (2.0f * window_size_squared);
+ const cl_float3 border_limits =
+ {
+ {
+ // -1 because we load 2 values at once for bilinear interpolation
+ static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+ static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+ static_cast<cl_float>(valid_region.start(0))
+ }
+ };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage0"));
+
+ // Set arguments
+ unsigned int idx = 3 * num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, old_points_internal->cl_buffer());
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, coeff_table->cl_buffer());
+ _kernel.setArg(idx++, old_ival->cl_buffer());
+ _kernel.setArg<cl_int>(idx++, window_size);
+ _kernel.setArg<cl_int>(idx++, window_size_squared);
+ _kernel.setArg<cl_int>(idx++, window_size_half);
+ _kernel.setArg<cl_float3>(idx++, border_limits);
+ _kernel.setArg<cl_float>(idx++, eig_const);
+ _kernel.setArg<cl_int>(idx++, level0);
+}
+
+void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Set the tensor arguments here rather than in configure(), as allocation might be deferred.
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _old_input, window);
+ add_2D_tensor_argument(idx, _old_scharr_gx, window);
+ add_2D_tensor_argument(idx, _old_scharr_gy, window);
+
+ enqueue(queue, *this, window);
+}
+
+CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel()
+ : _new_input(nullptr)
+{
+}
+
+void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
+
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+ _new_input = new_input;
+
+ // Configure kernel window
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ const ValidRegion &valid_region = new_input->info()->valid_region();
+
+ update_window_and_padding(window,
+ AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)));
+
+ ICLKernel::configure(window);
+
+ // Initialize required variables
+ const int level0 = (level == 0) ? 1 : 0;
+ const int window_size = window_dimension;
+ const int window_size_squared = window_dimension * window_dimension;
+ const int window_size_half = window_dimension / 2;
+ const float eig_const = 1.0f / (2.0f * window_size_squared);
+ const cl_float3 border_limits =
+ {
+ {
+ // -1 because we load 2 values at once for bilinear interpolation
+ static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+ static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+ static_cast<cl_float>(valid_region.start(0))
+ }
+ };
+ const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+ const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1"));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, coeff_table->cl_buffer());
+ _kernel.setArg(idx++, old_ival->cl_buffer());
+ _kernel.setArg<cl_int>(idx++, window_size);
+ _kernel.setArg<cl_int>(idx++, window_size_squared);
+ _kernel.setArg<cl_int>(idx++, window_size_half);
+ _kernel.setArg<cl_int>(idx++, num_iterations);
+ _kernel.setArg<cl_float>(idx++, epsilon);
+ _kernel.setArg<cl_float3>(idx++, border_limits);
+ _kernel.setArg<cl_float>(idx++, eig_const);
+ _kernel.setArg<cl_int>(idx++, level0);
+ _kernel.setArg<cl_int>(idx++, term_iteration);
+ _kernel.setArg<cl_int>(idx++, term_epsilon);
+}
+
+void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Set static tensor arguments. Setting here as allocation might be deferred.
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _new_input, window);
+
+ enqueue(queue, *this, window);
+}
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..794a1bc56e
--- /dev/null
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
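+ // Local work-group size hint; the 196 (= 14 * 14) case is presumably a shape-specific performance tuning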
+ if(output->info()->dimension(1) == 196)
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
+
+ std::ostringstream mm_arguments;
+ std::set<std::string> build_opts;
+
+ mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ Window matrix_b_window;
+ matrix_b_window.use_tensor_dimensions(_input1->info());
+ Window slice_matrix_b = matrix_b_window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_3D_tensor_argument(idx, _input1, slice_matrix_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
new file mode 100644
index 0000000000..c504189169
--- /dev/null
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLMagnitudePhaseKernel::CLMagnitudePhaseKernel()
+ : _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr), _run_mag(false), _run_phase(false)
+{
+}
+
+void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+ MagnitudeType mag_type, PhaseType phase_type)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+
+ _run_mag = (magnitude != nullptr);
+ _run_phase = (phase != nullptr);
+ if(_run_mag)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude);
+ }
+ if(_run_phase)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ }
+
+ if(!_run_mag && !_run_phase)
+ {
+ ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+ }
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ // Construct kernel name
+ std::set<std::string> build_opts = {};
+
+ // Add magnitude type
+ if(_run_mag)
+ {
+ switch(mag_type)
+ {
+ case MagnitudeType::L1NORM:
+ build_opts.insert("-DMAGNITUDE=1");
+ break;
+ case MagnitudeType::L2NORM:
+ build_opts.insert("-DMAGNITUDE=2");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported magnitude calculation type.");
+ build_opts.insert("-DMAGNITUDE=0");
+ break;
+ }
+ }
+
+ // Add phase type
+ if(_run_phase)
+ {
+ switch(phase_type)
+ {
+ case PhaseType::UNSIGNED:
+ build_opts.insert("-DPHASE=1");
+ break;
+ case PhaseType::SIGNED:
+ build_opts.insert("-DPHASE=2");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported phase calculation type.");
+ build_opts.insert("-DPHASE=0");
+ break;
+ }
+ }
+
+ // Add data_type
+ build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("magnitude_phase", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ gx_access, gy_access,
+ output_magnitude_access, output_phase_access);
+
+ ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
+ gy->info()->valid_region());
+ output_magnitude_access.set_valid_region(win, valid_region);
+ output_phase_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _gx, slice);
+ add_2D_tensor_argument(idx, _gy, slice);
+
+ if(_run_mag)
+ {
+ add_2D_tensor_argument(idx, _magnitude, slice);
+ }
+
+ if(_run_phase)
+ {
+ add_2D_tensor_argument(idx, _phase, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
new file mode 100644
index 0000000000..b0b748f466
--- /dev/null
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLMeanStdDevKernel::CLMeanStdDevKernel()
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr)
+{
+}
+
+void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == mean);
+ ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
+ ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
+
+ _input = input;
+ _mean = mean;
+ _stddev = stddev;
+ _global_sum = global_sum;
+ _global_sum_squared = global_sum_squared;
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ if(_stddev != nullptr)
+ {
+ build_opts.insert("-DSTDDEV");
+ }
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("mean_stddev_accumulate", build_opts));
+
+ // Set fixed arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters
+
+ _kernel.setArg(idx++, static_cast<cl_uint>(input->info()->dimension(1)));
+ _kernel.setArg(idx++, *_global_sum);
+
+ if(_stddev != nullptr)
+ {
+ _kernel.setArg(idx++, *_global_sum_squared);
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration_x = 8;
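+ // Each work-item processes an 8-element-wide strip spanning the whole image height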
+ const unsigned int num_elems_processed_per_iteration_y = input->info()->dimension(1);
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ update_window_and_padding(win, input_access);
+
+ ICLKernel::configure(win);
+}
+
+void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Clear sums
+ static const cl_ulong zero = 0;
+ queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+
+ if(_stddev != nullptr)
+ {
+ queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+ }
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ // Set slice step equal to height to force gws[1] to 1,
+ // as each work-item accumulates the sum across all rows of the columns it processes
+ slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+
+ // Calculate mean and stddev
+ cl_ulong global_sum = 0;
+ cl_ulong global_sum_squared = 0;
+ const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
+
+ queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum));
+ const float mean = global_sum / num_pixels;
+ *_mean = mean;
+
+ if(_stddev != nullptr)
+ {
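+ // Var(X) = E[X^2] - (E[X])^2, hence stddev = sqrt(global_sum_squared / num_pixels - mean * mean)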
+ queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared));
+ *_stddev = std::sqrt((global_sum_squared / num_pixels) - (mean * mean));
+ }
+}
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
new file mode 100644
index 0000000000..95334c7b5f
--- /dev/null
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+BorderSize CLMedian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_linear_filter_box3x3", { "-DMEDIAN" }));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
new file mode 100644
index 0000000000..939a53b03a
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <climits>
+
+using namespace arm_compute;
+
+CLMinMaxKernel::CLMinMaxKernel()
+ : _input(nullptr), _min_max(), _data_type_max_min()
+{
+}
+
+void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+
+ _input = input;
+ _min_max = min_max;
+ const unsigned int num_elems_processed_per_iteration = input->info()->dimension(0);
+
+ switch(input->info()->data_type())
+ {
+ case DataType::U8:
+ _data_type_max_min[0] = UCHAR_MAX;
+ _data_type_max_min[1] = 0;
+ break;
+ case DataType::S16:
+ _data_type_max_min[0] = SHRT_MAX;
+ _data_type_max_min[1] = SHRT_MIN;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("You called with the wrong image data types");
+ }
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0]));
+ build_opts.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1]));
+ build_opts.emplace((0 != (num_elems_processed_per_iteration % max_cl_vector_width)) ? "-DNON_MULTIPLE_OF_16" : "");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", build_opts));
+
+ // Set fixed arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters
+ _kernel.setArg(idx++, *_min_max);
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ ICLKernel::configure(win);
+}
+
+void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Reset minimum and maximum values
+ queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, _data_type_max_min.size() * sizeof(int), _data_type_max_min.data());
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLMinMaxLocationKernel::CLMinMaxLocationKernel()
+ : _input(nullptr), _min_max_count(nullptr)
+{
+}
+
+void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+ ARM_COMPUTE_ERROR_ON(min_max_count == nullptr && min_loc == nullptr && max_loc == nullptr);
+
+ _input = input;
+ _min_max_count = min_max_count;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
+ build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
+ build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts));
+
+ // Set static arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); //Skip the input parameters
+ _kernel.setArg(idx++, *min_max);
+ _kernel.setArg(idx++, *min_max_count);
+ if(min_loc != nullptr)
+ {
+ _kernel.setArg(idx++, min_loc->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values());
+ }
+ if(max_loc != nullptr)
+ {
+ _kernel.setArg(idx++, max_loc->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values());
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+ ICLKernel::configure(win);
+}
+
+void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
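+ // Reset the min/max occurrence counters (two consecutive cl_uint values in the count buffer)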
+ static const unsigned int zero_count = 0;
+ queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 0 * sizeof(zero_count), sizeof(zero_count), &zero_count);
+ queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, 1 * sizeof(zero_count), sizeof(zero_count), &zero_count);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
new file mode 100644
index 0000000000..6afa5822ba
--- /dev/null
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLNonLinearFilterKernel::CLNonLinearFilterKernel()
+ : _border_size(0)
+{
+}
+
+BorderSize CLNonLinearFilterKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
+ unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5);
+ ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!");
+ ARM_COMPUTE_UNUSED(mask);
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(mask_size / 2);
+
+ // Define build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("-D" + string_from_non_linear_filter_function(function));
+
+ // Define kernel
+ std::string pattern_name = string_from_matrix_pattern(pattern);
+ std::transform(pattern_name.begin(), pattern_name.end(), pattern_name.begin(), ::tolower);
+ std::stringstream ss;
+ ss << "non_linear_filter_" << pattern_name << mask_size << "x" << mask_size;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(ss.str(), build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ const unsigned int num_rows_read_per_iteration = mask_size;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
new file mode 100644
index 0000000000..6a96b0effd
--- /dev/null
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_max_suppression", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..106a5113db
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayerKernel::CLNormalizationLayerKernel()
+ : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
+{
+}
+
+BorderSize CLNormalizationLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+ ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ _input = input;
+ _squared_input = squared_input;
+ _output = output;
+
+ const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D);
+ const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+ _border_size = BorderSize(0, border_width);
+
+ // Create kernel
+ std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set kernel static arguments
+ unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the input, squared input and output parameters
+ _kernel.setArg<cl_float>(idx++, norm_info.scale_coeff());
+ _kernel.setArg<cl_float>(idx++, norm_info.beta());
+ _kernel.setArg<cl_float>(idx++, norm_info.kappa());
+ _kernel.setArg<cl_uint>(idx++, norm_info.norm_size() / 2);
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = (is_in_map) ? 4 : 1;
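+ // In-map normalization additionally reads norm_size / 2 elements on each side of the processed block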
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
+ AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, squared_input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _squared_input, slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000000..84eb434bc9
--- /dev/null
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ int scale_int = -1;
+ // Extract sign, exponent and mantissa
+ int exponent = 0;
+ float normalized_mantissa = std::frexp(scale, &exponent);
+ // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+ // frexp() returns a mantissa of 0.5, so for scale = 1/2^n with 0 <= n <= 15
+ // the exponent e = 1 - n lies in the range -14 <= e <= 1
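+ // e.g. scale = 1/256: frexp() returns 0.5 with exponent -7, so scale_int = |-7 - 1| = 8 (1/256 == 2^-8)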
+ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+ // Store the positive exponent. We know that we compute 1/2^n
+ // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+ scale_int = std::abs(exponent - 1);
+ }
+
+ std::string data_type;
+ std::string compute_type;
+ // Check if it has float inputs and output
+ if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
+ {
+ scale_int = -1;
+ compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half";
+ data_type = "DATA_TYPE_FLOAT";
+ }
+ else
+ {
+ compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort";
+ data_type = "DATA_TYPE_INT";
+ }
+
+ // Construct kernel name
+ std::string kernel_name = "pixelwise_mul";
+ kernel_name += (scale_int >= 0) ? "_int" : "_float";
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
+ build_opts.emplace("-D" + data_type);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set scale argument
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters
+
+ if(scale_int >= 0)
+ {
+ _kernel.setArg(idx++, scale_int);
+ }
+ else
+ {
+ _kernel.setArg(idx++, scale);
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input1, slice);
+ add_2D_tensor_argument(idx, _input2, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
new file mode 100644
index 0000000000..dc5ae4ec7a
--- /dev/null
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLPoolingLayerKernel::CLPoolingLayerKernel()
+ : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0)
+{
+}
+
+BorderSize CLPoolingLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ const PoolingType pool_type = pool_info.pool_type();
+ const int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+ DimensionRoundingType pool_round = pad_stride_info.round();
+ std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+
+ // Check output dimensions
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
+ input->info()->dimension(1),
+ pool_size,
+ pool_stride_x, pool_stride_y,
+ pool_pad_x, pool_pad_y,
+ pool_round);
+ ARM_COMPUTE_UNUSED(pooled_w);
+ ARM_COMPUTE_UNUSED(pooled_h);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
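+ // Grow the right/bottom border so the last pooling window fits entirely inside the padded input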
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ // Set instance variables
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _border_size = BorderSize(pool_pad_y, pool_pad_x);
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(("-DPOOL_" + ((PoolingType::MAX == pool_type) ? std::string("MAX") : std::string("AVG"))));
+
+ // Create kernel
+ std::string kernel_name = "pooling_layer_" + val_to_string(pool_size);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set static kernel arguments
+ if(pool_type == PoolingType::AVG)
+ {
+ // Create static kernel arguments
+ const cl_int2 max_dims =
+ {
+ {
+ static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x,
+ static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y,
+ }
+ };
+ const cl_int2 strides =
+ {
+ {
+ pool_stride_x,
+ pool_stride_y,
+ }
+ };
+ const cl_int2 paddings =
+ {
+ {
+ pool_pad_x,
+ pool_pad_y,
+ }
+ };
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ _kernel.setArg<cl_int2>(idx++, max_dims);
+ _kernel.setArg<cl_int2>(idx++, strides);
+ _kernel.setArg<cl_int2>(idx++, paddings);
+ }
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ unsigned int pool_pad_x = 0, pool_pad_y = 0, pool_stride_x = 0, pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+ Window slice = window.first_slice_window_3D();
+
+ do
+ {
+ // Upsample input by pool size
+ Window in_slice(slice);
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x));
+ in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
+
+ // Set inputs
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_3D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
new file mode 100644
index 0000000000..e63a5ef7c6
--- /dev/null
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLRemapKernel::CLRemapKernel()
+ : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
+{
+}
+
+BorderSize CLRemapKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
+
+ _input = input;
+ _output = output;
+ _map_x = map_x;
+ _map_y = map_y;
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "remap_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+ const int border_offset = (border_undefined) ? 0 : border_size().left;
+
+ Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input_access(input->info(), -border_offset, -border_offset,
+ input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+
+ // Set static arguments
+ unsigned int idx = 4 * num_arguments_per_2D_tensor(); // Skip the input, output, map_x and map_y tensor arguments
+ _kernel.setArg<cl_float>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_float>(idx++, input->info()->dimension(1));
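+
+ // Note: the input dimensions are deliberately passed as cl_float; presumably
+ // this lets the kernel clamp the sampled map coordinates in floating point
+ // without extra integer-to-float conversions.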
+}
+
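+// A minimal usage sketch (hypothetical tensor names; assumes U8 src/dst and F32
+// map_x/map_y have been configured and allocated elsewhere, border handling omitted):
+//
+//   CLRemapKernel remap;
+//   remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::BILINEAR, false);
+//   remap.run(remap.window(), CLScheduler::get().queue());
+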
+void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ add_2D_tensor_argument(idx, _map_x, slice);
+ add_2D_tensor_argument(idx, _map_y, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
new file mode 100644
index 0000000000..d74e837ace
--- /dev/null
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLScaleKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ /* Compute the ratio between source width/height and destination width/height */
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+ /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */
+ if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ {
+ policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA);
+ }
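+
+ // Worked example (illustrative numbers): up-scaling 320x240 to 640x480 gives
+ // wr = 320/640 = 0.5 and hr = 240/480 = 0.5, so an AREA request falls back to
+ // NEAREST_NEIGHBOR; any down-scale (wr or hr > 1.f) triggers the error above.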
+
+ // Create kernel
+ std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "scale_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+ const int border_offset = (border_undefined) ? 0 : border_size().left;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), -border_offset, -border_offset,
+ input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg<float>(idx++, input->info()->dimension(0));
+ _kernel.setArg<float>(idx++, input->info()->dimension(1));
+ _kernel.setArg<float>(idx++, output->info()->dimension(0));
+ _kernel.setArg<float>(idx++, output->info()->dimension(1));
+}
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
new file mode 100644
index 0000000000..913ef592d4
--- /dev/null
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLScharr3x3Kernel::CLScharr3x3Kernel()
+ : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+BorderSize CLScharr3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_scharr_x = output_x != nullptr;
+ _run_scharr_y = output_y != nullptr;
+
+ if(_run_scharr_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_scharr_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_scharr_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_scharr_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("scharr3x3", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
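+ // Each iteration writes 8 output pixels; the 3x3 neighbourhood adds one pixel
+ // of border on each side, so the kernel reads a full 16-element vector
+ // (8 + 2 rounded up to the next vector size) and 3 input rows per iteration.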
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_scharr_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_scharr_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
new file mode 100644
index 0000000000..436aaa498a
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLSobel3x3Kernel::CLSobel3x3Kernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize CLSobel3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel3x3", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
new file mode 100644
index 0000000000..4c0316f19e
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLSobel5x5HorKernel::CLSobel5x5HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize CLSobel5x5HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x5", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
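+// The 5x5 Sobel filter is separable: the horizontal kernel above performs the
+// 1x5 row pass into S16 intermediates, and the vertical kernel below completes
+// the 5x1 column pass. A usage sketch (hypothetical tensor names, border
+// handling of the intermediates omitted):
+//
+//   CLSobel5x5HorKernel hor;
+//   CLSobel5x5VertKernel vert;
+//   hor.configure(&src, &tmp_x, &tmp_y, false);
+//   vert.configure(&tmp_x, &tmp_y, &dst_x, &dst_y, false);
+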
+CLSobel5x5VertKernel::CLSobel5x5VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize CLSobel5x5VertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable5x1", build_opts));
+
+ const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _input_x, slice);
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _input_y, slice);
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ _kernel.setArg(idx++, 0 /*dummy*/);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
new file mode 100644
index 0000000000..a477953cfb
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLSobel7x7HorKernel::CLSobel7x7HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize CLSobel7x7HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+ _border_size = BorderSize(border_undefined ? 0 : 3, 3);
+
+ // Construct kernel name
+ std::string kernel_name = "sobel_separable1x7";
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice);
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
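+// As in the 5x5 variant, the 7x7 filter is split into a 1x7 horizontal pass
+// (above, producing S32 intermediates) and a 7x1 vertical pass (below).
+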
+CLSobel7x7VertKernel::CLSobel7x7VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize CLSobel7x7VertKernel::border_size() const
+{
+ return BorderSize(3, 0);
+}
+
+void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Set build options
+ std::set<std::string> build_opts;
+
+ if(_run_sobel_x)
+ {
+ build_opts.insert("-DGRAD_X");
+ }
+
+ if(_run_sobel_y)
+ {
+ build_opts.insert("-DGRAD_Y");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable7x1", build_opts));
+
+ const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 7;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
+
+void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+
+ if(_run_sobel_x)
+ {
+ add_2D_tensor_argument(idx, _input_x, slice);
+ add_2D_tensor_argument(idx, _output_x, slice);
+ }
+
+ if(_run_sobel_y)
+ {
+ add_2D_tensor_argument(idx, _input_y, slice);
+ add_2D_tensor_argument(idx, _output_y, slice);
+ }
+
+ _kernel.setArg(idx++, 0 /*dummy*/);
+
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
new file mode 100644
index 0000000000..0470d5243e
--- /dev/null
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
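+// The softmax layer is decomposed into the three kernels in this file; for a
+// row x they compute, in order:
+//   1) CLLogits1DMaxKernel:         m     = max_j(x_j)
+//   2) CLLogits1DShiftExpSumKernel: e_j   = exp(x_j - m),  s = sum_j(e_j)
+//   3) CLLogits1DNormKernel:        out_j = e_j / s
+// Subtracting the row maximum before exponentiation is the standard trick to
+// avoid overflow; it cancels out in the final quotient.
+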
+void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ // The kernel loops over a whole row in steps of 16 elements, so the window
+ // step is the row width rounded up to the next multiple of 16
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
+
+ // Set build options
+ std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+ // Tell the kernel that the width is not a multiple of 16
+ if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+
+ // Set fixed arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
+ : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+
+ _input = input;
+ _max = max;
+ _output = output;
+ _sum = sum;
+
+ // The kernel loops over a whole row in steps of 16 elements, so the window
+ // step is the row width rounded up to the next multiple of 16
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
+
+ // Set build options
+ std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+ // Tell the kernel that the width is not a multiple of 16
+ if((input->info()->dimension(0) % max_cl_vector_width) != 0)
+ {
+ build_opts.emplace("-DNON_MULTIPLE_OF_16");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
+
+ // Set fixed arguments
+ unsigned int idx = 4 * num_arguments_per_2D_tensor(); // Skip the input, max, output and sum tensor arguments
+ _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal max_access(max->info(), 0, 1);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+ sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ // Set tensor arguments
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _max, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ add_2D_tensor_argument(idx, _sum, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
+
+CLLogits1DNormKernel::CLLogits1DNormKernel()
+ : _input(nullptr), _sum(nullptr), _output(nullptr)
+{
+}
+
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DUSE_" + string_from_data_type(input->info()->data_type())));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+
+ // Configure window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, sum_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
+
+void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do
+ {
+ Window sum_slice = slice;
+ sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ unsigned int idx = 0;
+ // Set tensor arguments
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _sum, sum_slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
new file mode 100644
index 0000000000..bbdaa37410
--- /dev/null
+++ b/src/core/CL/kernels/CLTableLookupKernel.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstdint>
+#include <string>
+
+using namespace arm_compute;
+
+void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+ ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ // Create kernel
+ std::string kernel_name = (DataType::S16 == lut->type()) ? "tablelookup_S16" : "tablelookup_U8";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set lut argument
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, lut->cl_buffer());
+ if(DataType::S16 == lut->type())
+ {
+ _kernel.setArg(idx++, lut->index_offset());
+ _kernel.setArg(idx++, static_cast<uint32_t>(lut->num_elements()));
+ }
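+
+ // For S16 tables the index offset maps negative indices into the buffer and
+ // the element count lets the kernel clamp out-of-range look-ups; U8 tables
+ // always cover the full 0-255 range, so no extra arguments are needed.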
+
+ // Configure kernel
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
new file mode 100644
index 0000000000..6e07cefc77
--- /dev/null
+++ b/src/core/CL/kernels/CLThresholdKernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+ uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Construct kernel name
+ std::string kernel_name = "threshold";
+
+ switch(type)
+ {
+ case ThresholdType::BINARY:
+ kernel_name += "_binary";
+ break;
+ case ThresholdType::RANGE:
+ kernel_name += "_range";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Thresholding type not recognized");
+ break;
+ }
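+
+ // For reference, the usual (OpenVX-style) semantics of the two modes are:
+ // BINARY: dst = (src > threshold) ? true_value : false_value
+ // RANGE: dst = (src < threshold || src > upper) ? false_value : true_value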
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg(idx++, false_value);
+ _kernel.setArg(idx++, true_value);
+ _kernel.setArg(idx++, threshold);
+
+ if(ThresholdType::RANGE == type)
+ {
+ _kernel.setArg(idx++, upper);
+ }
+
+ // Make sure _kernel is initialized before calling the parent's configure
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
new file mode 100644
index 0000000000..2ee6fcb9dc
--- /dev/null
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
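+// The kernel transposes the tensor in square blocks of N x N elements, where
+// N = max_cl_vector_width / element_size; AccessWindowTranspose maps the output
+// access onto the swapped shape so that padding is computed consistently.
+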
+void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t w_out = input->info()->dimension(1);
+ const size_t h_out = input->info()->dimension(0);
+ output_shape.set(0, w_out);
+ output_shape.set(1, h_out);
+
+ // Auto-initialize the output tensor if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+ _lws_hint = cl::NDRange(2, 8);
+
+ std::set<std::string> build_opts;
+ std::ostringstream data_type_in_bytes;
+ data_type_in_bytes << input->info()->element_size();
+ build_opts.emplace("-DDATA_TYPE_IN_BYTES=" + data_type_in_bytes.str());
+
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("transpose", build_opts));
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->info()->element_size();
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
new file mode 100644
index 0000000000..e549dbc258
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+ for(size_t i = 0; i < size; ++i)
+ {
+ std::stringstream mat_str;
+ mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+ options.insert(mat_str.str());
+ }
+}
+} // namespace
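+
+// For example, passing the six coefficients { 1, 0, 0, 1, 0, 0 } (illustrative
+// values) yields the build options:
+//   -DMAT0=1 -DMAT1=0 -DMAT2=0 -DMAT3=1 -DMAT4=0 -DMAT5=0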
+
+BorderSize CLWarpAffineKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+
+ _input = input;
+ _output = output;
+
+ // Create build options
+ std::set<std::string> options;
+ options_add_matrix(options, matrix, 6);
+ options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ // Create kernel
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "warp_affine_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
new file mode 100644
index 0000000000..fddb580750
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+ for(size_t i = 0; i < size; ++i)
+ {
+ std::stringstream mat_str;
+ mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+ options.insert(mat_str.str());
+ }
+}
+} // namespace
+
+BorderSize CLWarpPerspectiveKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+
+ _input = input;
+ _output = output;
+
+ // Create build options
+ std::set<std::string> options;
+ options_add_matrix(options, matrix, 9);
+ options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+ // Create kernel
+ std::string interpolation_name = string_from_interpolation_policy(policy);
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "warp_perspective_" + interpolation_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+ // Set static kernel arguments
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+ _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..018f272921
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
+ : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ if(_is_shared)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
+ }
+
+ // Check biases
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ }
+
+ _biases = biases;
+ _output = output;
+ _input = input;
+
+ // Create build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+ build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+ // Set static arguments
+ unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+ idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
+ _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps());
+ // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ ICLKernel::configure(win);
+}
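+
+// Layout sketch (an assumption from the checks above, not normative): for
+// convolution weights of shape [kx, ky, ifm, ofm], "reshape_to_columns" is
+// expected to produce a 2D matrix with ofm columns, each holding one kernel
+// flattened to kx * ky * ifm values, plus one extra row for the bias when
+// -DHAS_BIAS is defined; this is the shape consumed by the subsequent GEMM.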
+
+CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
+ : CLWeightsReshapeKernel(false)
+{
+}
+
+void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ // Set arguments
+    unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ if(_biases != nullptr)
+ {
+ Window biases_slice;
+ biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+ }
+
+ // Run kernel
+ enqueue(queue, *this, in_slice);
+}
+
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
+ : CLWeightsReshapeKernel(true)
+{
+}
+
+void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ Window biases_window;
+ Window biases_slice;
+
+ if(_biases != nullptr)
+ {
+ biases_window.use_tensor_dimensions(_biases->info());
+ biases_slice = biases_window.first_slice_window_1D();
+ }
+
+ do
+ {
+ // Set arguments
+        unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ if(_biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+ biases_window.slide_window_slice_1D(biases_slice);
+ }
+
+ // Run kernel
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
new file mode 100644
index 0000000000..9d18a9c165
--- /dev/null
+++ b/src/core/CPP/ICPPSimpleKernel.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+
+using namespace arm_compute;
+
+ICPPSimpleKernel::ICPPSimpleKernel()
+ : _input{ nullptr }, _output{ nullptr }
+{
+}
+
+void ICPPSimpleKernel::configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+{
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
+
+ ICPPKernel::configure(win);
+}
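+
+// Minimal usage sketch (hypothetical derived kernel, illustration only): an
+// element-wise kernel processing 16 elements per iteration with an undefined
+// one-pixel border would forward its tensors as:
+//
+//   void MySimpleKernel::configure(const ITensor *in, ITensor *out)
+//   {
+//       ICPPSimpleKernel::configure(in, out, 16, true /* border undefined */, BorderSize(1));
+//   }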
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
new file mode 100644
index 0000000000..884da2861b
--- /dev/null
+++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+namespace
+{
+inline void check_corner(float x, float y, float strength, InternalKeypoint *output, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+{
+ if(strength != 0.0f)
+ {
+        /* Set index and update num_corner_candidates */
+ std::unique_lock<std::mutex> lock(*corner_candidates_mutex);
+
+ const int32_t idx = *num_corner_candidates;
+
+ *num_corner_candidates += 1;
+
+ lock.unlock();
+
+ /* Add keypoint */
+ output[idx] = std::make_tuple(x, y, strength);
+ }
+}
+
+inline void corner_candidates(const float *__restrict input, InternalKeypoint *__restrict output, int32_t x, int32_t y, int32_t *num_corner_candidates, std::mutex *corner_candidates_mutex)
+{
+ check_corner(x + 0, y, *(input + 0), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x + 1, y, *(input + 1), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x + 2, y, *(input + 2), output, num_corner_candidates, corner_candidates_mutex);
+ check_corner(x + 3, y, *(input + 3), output, num_corner_candidates, corner_candidates_mutex);
+}
+} // namespace
+
+bool keypoint_compare(const InternalKeypoint &lhs, const InternalKeypoint &rhs)
+{
+ return std::get<2>(lhs) > std::get<2>(rhs);
+}
+
+CPPCornerCandidatesKernel::CPPCornerCandidatesKernel()
+ : _num_corner_candidates(nullptr), _corner_candidates_mutex(), _input(nullptr), _output(nullptr)
+{
+}
+
+void CPPCornerCandidatesKernel::configure(const IImage *input, InternalKeypoint *output, int32_t *num_corner_candidates)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == num_corner_candidates);
+ ARM_COMPUTE_ERROR_ON(*num_corner_candidates != 0);
+
+ _input = input;
+ _output = output;
+ _num_corner_candidates = num_corner_candidates;
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void CPPCornerCandidatesKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ corner_candidates(reinterpret_cast<float *>(input.ptr()), &_output[0], id.x(), id.y(), _num_corner_candidates, &_corner_candidates_mutex);
+ },
+ input);
+}
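+
+// Note: check_corner() serialises index allocation on _corner_candidates_mutex.
+// A lock-free sketch under the same assumptions would replace the counter with
+// std::atomic<int32_t> and claim slots via
+// num_corner_candidates->fetch_add(1, std::memory_order_relaxed); the mutex
+// form is kept here to match the member type declared in the header.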
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
new file mode 100644
index 0000000000..62bfdd60ba
--- /dev/null
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+
+namespace
+{
+bool compare_detection_window(const DetectionWindow &lhs, const DetectionWindow &rhs)
+{
+ return lhs.score > rhs.score;
+}
+} // namespace
+
+CPPDetectionWindowNonMaximaSuppressionKernel::CPPDetectionWindowNonMaximaSuppressionKernel()
+ : _input_output(nullptr), _min_distance(0.0f)
+{
+}
+
+bool CPPDetectionWindowNonMaximaSuppressionKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void CPPDetectionWindowNonMaximaSuppressionKernel::configure(IDetectionWindowArray *input_output, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input_output);
+
+ _input_output = input_output;
+ _min_distance = min_distance;
+
+ IKernel::configure(Window()); // Default 1 iteration window
+}
+
+void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_input_output->buffer() == nullptr);
+
+ const size_t num_candidates = _input_output->num_values();
+ size_t num_detections = 0;
+
+ // Sort list of candidates
+ std::sort(_input_output->buffer(), _input_output->buffer() + num_candidates, compare_detection_window);
+
+ const float min_distance_pow2 = _min_distance * _min_distance;
+
+ // Euclidean distance
+ for(size_t i = 0; i < num_candidates; ++i)
+ {
+ if(0.0f != _input_output->at(i).score)
+ {
+ DetectionWindow cur;
+ cur.x = _input_output->at(i).x;
+ cur.y = _input_output->at(i).y;
+ cur.width = _input_output->at(i).width;
+ cur.height = _input_output->at(i).height;
+ cur.idx_class = _input_output->at(i).idx_class;
+ cur.score = _input_output->at(i).score;
+
+ // Store window
+ _input_output->at(num_detections) = cur;
+
+ ++num_detections;
+
+ const float xc = cur.x + cur.width * 0.5f;
+ const float yc = cur.y + cur.height * 0.5f;
+
+ for(size_t k = i + 1; k < num_candidates; ++k)
+ {
+ const float xn = _input_output->at(k).x + _input_output->at(k).width * 0.5f;
+ const float yn = _input_output->at(k).y + _input_output->at(k).height * 0.5f;
+
+ const float dx = std::fabs(xn - xc);
+ const float dy = std::fabs(yn - yc);
+
+ if(dx < _min_distance && dy < _min_distance)
+ {
+ const float d = dx * dx + dy * dy;
+
+ if(d < min_distance_pow2)
+ {
+ // Invalidate keypoint
+ _input_output->at(k).score = 0.0f;
+ }
+ }
+ }
+ }
+ }
+
+ _input_output->resize(num_detections);
+}
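+
+// Worked example (illustrative): with _min_distance = 10, two 20x20 windows
+// at (0,0) and (6,8) have centres (10,10) and (16,18), so dx = 6, dy = 8 and
+// d = 36 + 64 = 100, which is not < 100: the weaker window survives. Moved to
+// (6,7), d = 36 + 49 = 85 < 100 and the weaker window is suppressed.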
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
new file mode 100644
index 0000000000..09d3ccffa4
--- /dev/null
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+
+namespace
+{
+bool keypoint_compare(const InternalKeypoint &lhs, const InternalKeypoint &rhs)
+{
+ return std::get<2>(lhs) > std::get<2>(rhs);
+}
+} // namespace
+
+CPPSortEuclideanDistanceKernel::CPPSortEuclideanDistanceKernel()
+ : _num_corner_candidates(), _min_distance(0.0f), _in_out(nullptr), _output(nullptr)
+{
+}
+
+void CPPSortEuclideanDistanceKernel::configure(InternalKeypoint *in_out, IKeyPointArray *output, const int32_t *num_corner_candidates, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == in_out);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == num_corner_candidates);
+ ARM_COMPUTE_ERROR_ON(!((min_distance > 0) && (min_distance <= 30)));
+
+ _in_out = in_out;
+ _output = output;
+ _min_distance = min_distance * min_distance; // We compare squares of distances
+ _num_corner_candidates = num_corner_candidates;
+ ICPPKernel::configure(Window()); // Default 1 iteration window
+}
+
+bool CPPSortEuclideanDistanceKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void CPPSortEuclideanDistanceKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
+
+ const int32_t num_corner_candidates = *_num_corner_candidates;
+
+ /* Sort list of corner candidates */
+ std::sort(_in_out, _in_out + num_corner_candidates, keypoint_compare);
+
+ /* Euclidean distance */
+ for(int32_t i = 0; i < num_corner_candidates; ++i)
+ {
+ if(std::get<2>(_in_out[i]) != 0.0f)
+ {
+ KeyPoint keypt;
+ const auto xc = std::get<0>(_in_out[i]);
+ const auto yc = std::get<1>(_in_out[i]);
+
+ keypt.x = xc;
+ keypt.y = yc;
+ keypt.strength = std::get<2>(_in_out[i]);
+ keypt.tracking_status = 1;
+
+ /* Store corner */
+ _output->push_back(keypt);
+ for(int32_t k = i + 1; k < num_corner_candidates; ++k)
+ {
+ const float dx = std::fabs(std::get<0>(_in_out[k]) - xc);
+ const float dy = std::fabs(std::get<1>(_in_out[k]) - yc);
+
+ if((dx < _min_distance) && (dy < _min_distance))
+ {
+ const float d = (dx * dx + dy * dy);
+
+ if(d < _min_distance)
+ {
+ /* Invalidate keypoint */
+ std::get<2>(_in_out[k]) = 0.0f;
+ }
+ }
+ }
+ }
+ }
+}
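+
+// Note on the pre-check above: _min_distance holds the squared distance, so
+// comparing dx and dy against it is a loose filter for the usual pixel-scale
+// values (min_distance >= 1 implies min_distance^2 >= min_distance); the
+// exact squared test d < _min_distance makes the final decision.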
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
new file mode 100644
index 0000000000..389e390736
--- /dev/null
+++ b/src/core/Error.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+void arm_compute::error(const char *function, const char *file, const int line, const char *msg, ...)
+{
+ char out[512];
+ va_list args;
+ va_start(args, msg);
+    int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
+    // snprintf returns the untruncated length: clamp it so the write below stays in bounds
+    if(offset >= static_cast<int>(sizeof(out)))
+    {
+        offset = sizeof(out) - 1;
+    }
+    vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+ va_end(args);
+
+ throw std::runtime_error(std::string(out));
+}
+
+void arm_compute::debug(const char *function, const char *file, const int line, const char *msg, ...)
+{
+ char out[512];
+ va_list args;
+ va_start(args, msg);
+    int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
+    // snprintf returns the untruncated length: clamp it so the write below stays in bounds
+    if(offset >= static_cast<int>(sizeof(out)))
+    {
+        offset = sizeof(out) - 1;
+    }
+    vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+ va_end(args);
+ std::cout << std::string(out) << std::endl;
+}
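+
+// Usage sketch (assumed wiring, see arm_compute/core/Error.h): the
+// ARM_COMPUTE_ERROR* macros are expected to forward the call site here,
+// along the lines of:
+//
+//   #define ARM_COMPUTE_ERROR(...) ::arm_compute::error(__func__, __FILE__, __LINE__, __VA_ARGS__)
+//
+// so a failing check surfaces as a std::runtime_error whose message carries
+// the function, file and line of the check.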
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
new file mode 100644
index 0000000000..1b6175e68f
--- /dev/null
+++ b/src/core/HOGInfo.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/HOGInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+HOGInfo::HOGInfo()
+ : _cell_size(), _block_size(), _detection_window_size(), _block_stride(), _num_bins(0), _normalization_type(HOGNormType::L2HYS_NORM), _l2_hyst_threshold(0.0f), _phase_type(PhaseType::UNSIGNED),
+ _descriptor_size(0)
+{
+}
+
+HOGInfo::HOGInfo(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
+ HOGNormType normalization_type, float l2_hyst_threshold, PhaseType phase_type)
+ : HOGInfo()
+{
+ init(cell_size, block_size, detection_window_size, block_stride, num_bins, normalization_type, l2_hyst_threshold, phase_type);
+}
+
+void HOGInfo::init(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
+ HOGNormType normalization_type, float l2_hyst_threshold, PhaseType phase_type)
+{
+    ARM_COMPUTE_ERROR_ON_MSG((block_size.width % cell_size.width), "Block width must be a multiple of cell width");
+    ARM_COMPUTE_ERROR_ON_MSG((block_size.height % cell_size.height), "Block height must be a multiple of cell height");
+    ARM_COMPUTE_ERROR_ON_MSG((block_stride.width % cell_size.width), "Block stride width must be a multiple of cell width");
+    ARM_COMPUTE_ERROR_ON_MSG((block_stride.height % cell_size.height), "Block stride height must be a multiple of cell height");
+    ARM_COMPUTE_ERROR_ON_MSG(((detection_window_size.width - block_size.width) % block_stride.width), "Window width minus block width must be a multiple of block stride width");
+    ARM_COMPUTE_ERROR_ON_MSG(((detection_window_size.height - block_size.height) % block_stride.height), "Window height minus block height must be a multiple of block stride height");
+
+ _cell_size = cell_size;
+ _block_size = block_size;
+ _detection_window_size = detection_window_size;
+ _block_stride = block_stride;
+ _num_bins = num_bins;
+ _normalization_type = normalization_type;
+ _l2_hyst_threshold = l2_hyst_threshold;
+ _phase_type = phase_type;
+
+    // Compute the descriptor size; the +1 accounts for the bias term
+ _descriptor_size = num_cells_per_block().area() * num_blocks_per_image(_detection_window_size).area() * _num_bins + 1;
+}
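+
+// Worked example (classic pedestrian-detector parameters, illustration only):
+// cell 8x8, block 16x16, block stride 8x8, window 64x128 and 9 bins give
+// num_cells_per_block() = 2x2 = 4, num_blocks_per_image(64x128) =
+// ((64 - 16) / 8 + 1) * ((128 - 16) / 8 + 1) = 7 * 15 = 105, and therefore
+// _descriptor_size = 4 * 105 * 9 + 1 = 3781 (3780 histogram values + 1 bias).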
+
+Size2D HOGInfo::num_cells_per_block() const
+{
+ return Size2D(_block_size.width / _cell_size.width,
+ _block_size.height / _cell_size.height);
+}
+
+Size2D HOGInfo::num_blocks_per_image(const Size2D &image_size) const
+{
+ return Size2D(((image_size.width - _block_size.width) / _block_stride.width) + 1,
+ ((image_size.height - _block_size.height) / _block_stride.height) + 1);
+}
+
+const Size2D &HOGInfo::cell_size() const
+{
+ return _cell_size;
+}
+
+const Size2D &HOGInfo::block_size() const
+{
+ return _block_size;
+}
+
+const Size2D &HOGInfo::detection_window_size() const
+{
+ return _detection_window_size;
+}
+
+const Size2D &HOGInfo::block_stride() const
+{
+ return _block_stride;
+}
+
+size_t HOGInfo::num_bins() const
+{
+ return _num_bins;
+}
+
+HOGNormType HOGInfo::normalization_type() const
+{
+ return _normalization_type;
+}
+
+float HOGInfo::l2_hyst_threshold() const
+{
+ return _l2_hyst_threshold;
+}
+
+PhaseType HOGInfo::phase_type() const
+{
+ return _phase_type;
+}
+
+size_t HOGInfo::descriptor_size() const
+{
+ return _descriptor_size;
+}
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
new file mode 100644
index 0000000000..ff903e9802
--- /dev/null
+++ b/src/core/Helpers.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <cstdint>
+
+using namespace arm_compute;
+
+Window arm_compute::calculate_max_window(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+ if(!skip_border)
+ {
+ border_size = BorderSize(0);
+ }
+
+ const Coordinates &anchor = info.valid_region().anchor;
+ const TensorShape &shape = info.valid_region().shape;
+
+ Window window;
+
+ window.set(0, Window::Dimension(
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ steps[0]));
+
+ size_t n = 1;
+ const TensorShape &tensor_shape = info.tensor_shape();
+
+ if(tensor_shape.num_dimensions() > 1)
+ {
+ window.set(1, Window::Dimension(
+ // Skip the border above the image
+ anchor[1] + border_size.top,
+ // Skip the border below the image
+ anchor[1] + border_size.top + ceil_to_multiple(shape[1] - border_size.top - border_size.bottom, steps[1]),
+ steps[1]));
+
+ ++n;
+ }
+
+ for(; n < Coordinates::num_max_dimensions; ++n)
+ {
+ window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ }
+
+ return window;
+}
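+
+// Worked example (illustrative): for a valid region anchored at (0, 0) with
+// shape 100 in x, Steps(4), skip_border = true and BorderSize(1), dimension 0
+// becomes [1, 1 + ceil_to_multiple(100 - 1 - 1, 4)) = [1, 101) with step 4,
+// so the last iteration starts at x = 97 and runs into the padded right edge.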
+
+Window arm_compute::calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps, BorderSize border_size)
+{
+ const Coordinates &anchor = info.valid_region().anchor;
+ const TensorShape &shape = info.valid_region().shape;
+
+ Window window;
+
+ window.set(0, Window::Dimension(
+                   // Move the anchor left so the window starts at the left border
+                   anchor[0] - border_size.left,
+                   // Extend the end to include the right border
+ // Make sure the window width is a multiple of the step size
+ anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+ steps[0]));
+
+ size_t n = 1;
+ const TensorShape &tensor_shape = info.tensor_shape();
+
+ if(tensor_shape.num_dimensions() > 1)
+ {
+ window.set(1, Window::Dimension(
+ // Include the border above the image
+ anchor[1] - border_size.top,
+ // Include the border below the image
+ anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+ steps[1]));
+
+ ++n;
+ }
+
+ for(; n < Coordinates::num_max_dimensions; ++n)
+ {
+ window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ }
+
+ return window;
+}
+
+Window arm_compute::calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+{
+ if(skip_border)
+ {
+ border_size.top = 0;
+ border_size.bottom = 0;
+ }
+ else
+ {
+ border_size.left = 0;
+ border_size.right = 0;
+ }
+
+ const Coordinates &anchor = info.valid_region().anchor;
+ const TensorShape &shape = info.valid_region().shape;
+
+ Window window;
+
+ window.set(0, Window::Dimension(
+ // Skip the border left of the image
+ anchor[0] + border_size.left,
+ // Skip the border right of the image
+ // Make sure the window width is a multiple of the step size
+ anchor[0] + border_size.left + ceil_to_multiple(shape[0] - border_size.left - border_size.right, steps[0]),
+ steps[0]));
+
+ size_t n = 1;
+ const TensorShape &tensor_shape = info.tensor_shape();
+
+ if(tensor_shape.num_dimensions() > 1)
+ {
+ window.set(1, Window::Dimension(
+                       // Include the border above the image
+                       anchor[1] - border_size.top,
+                       // Include the border below the image
+ anchor[1] + shape[1] + border_size.bottom,
+ 1));
+
+ ++n;
+ }
+
+ for(; n < Coordinates::num_max_dimensions; ++n)
+ {
+ window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ }
+
+ return window;
+}
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
new file mode 100644
index 0000000000..4ddc0fef1d
--- /dev/null
+++ b/src/core/IAccessWindow.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IAccessWindow.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const
+{
+ return compute_valid_region(window, input_valid_region, false, BorderSize(0));
+}
+
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+{
+ if(_info == nullptr)
+ {
+ return input_valid_region;
+ }
+
+ Coordinates &anchor = input_valid_region.anchor;
+ Coordinates old_anchor(anchor);
+ TensorShape &shape = input_valid_region.shape;
+
+ if(!border_undefined)
+ {
+ border_size = BorderSize(0);
+ }
+
+ // Start of the valid region is equal to the start of the window. But it
+ // cannot be less than the start of the input's valid region plus the border
+    // size required by this kernel (if the border is undefined).
+ // Additionally the valid region is shifted by the offset that is used by
+ // the kernel to write back output values.
+ anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
+ if(_info->num_dimensions() > 1)
+ {
+ anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
+ }
+
+ // End of the valid region is equal to the start of the last write of the
+ // kernel plus the number of written elements. (This assumes that all
+ // written elements are valid). Nevertheless the end cannot be larger than
+ // the end of the input's valid region minus the border size.
+    // Note: the region stores its size rather than its end points. Thus the
+    // old size is first converted into end points to be compared against the
+    // execution window; afterwards the new end points are converted back into
+    // a size for the region.
+ shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
+ if(_info->num_dimensions() > 1)
+ {
+ shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+ }
+
+ // For higher dimensions use the intersection of the window size and the
+ // valid region of the input
+ for(size_t d = 2; d < _info->num_dimensions(); ++d)
+ {
+ anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
+ shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
+ }
+
+ return input_valid_region;
+}
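+
+// Worked example (illustrative): for a 3x3 filter (border_size = 1, undefined
+// borders) writing one element per iteration over a 100x100 input whose valid
+// region is the full tensor, the formulas above move the output's anchor from
+// (0, 0) to (1, 1) and shrink the shape to 98x98: the one-pixel frame the
+// filter cannot compute is excluded from the output's valid region.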
+
+void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size)
+{
+ if(_info != nullptr)
+ {
+ _info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size));
+ }
+}
+
+bool AccessWindowRectangle::update_window_if_needed(Window &window) const
+{
+ // Only update the window size if we can't use padding
+ if(_info == nullptr || _info->is_resizable())
+ {
+ return false;
+ }
+
+ const TensorShape &shape = _info->tensor_shape();
+ const Strides &strides = _info->strides_in_bytes();
+ const size_t offset_first_element = _info->offset_first_element_in_bytes();
+
+ bool window_modified = false;
+
+ int front_pad_y = 0;
+
+ const int min_y = window.y().start() * _scale_y + _y;
+ const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
+
+ // Adjust window start for Y dimension
+ if(min_y < 0)
+ {
+ // Calculate rows available above the tensor
+ const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
+
+ if(min_y < front_pad_y_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
+
+ window.set(1, Window::Dimension(start / _scale_y, window.y().end(), window.y().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_y = std::max(0, static_cast<int>(std::floor(-window.y().start() * _scale_y)) - _y);
+ }
+
+ // Adjust window end for Y dimension
+ if(max_y > static_cast<int>(shape[1]))
+ {
+ const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
+
+ // Calculate rows available below the tensor
+ const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
+
+ if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
+ window.set(1, Window::Dimension(window.y().start(), end / _scale_y, window.y().step()));
+ window_modified = true;
+ }
+ }
+
+ int front_pad_x = 0;
+
+ const int min_x = window.x().start() * _scale_x + _x;
+ const int max_x = (window.x().end() - window.x().step()) * _scale_x + _x + _width;
+
+ const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
+
+ // Adjust window start for X dimension
+ if(min_x < 0)
+ {
+ const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+
+ if(min_x < front_pad_x_available)
+ {
+ // Not enough padding available, need to shrink the window
+ const int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
+ window.set(0, Window::Dimension(start / _scale_x, window.x().end(), window.x().step()));
+ window_modified = true;
+ }
+
+ // Update front padding with reconstructed value
+ front_pad_x = std::max(0, static_cast<int>(std::floor(-window.x().start() * _scale_x)) - _x);
+ }
+
+ // Adjust window end for X dimension
+ if(max_x > static_cast<int>(shape[0]))
+ {
+ const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
+
+ if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+ {
+ // Not enough padding available, need to shrink the window
+ const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
+ window.set(0, Window::Dimension(window.x().start(), end / _scale_x, window.x().step()));
+ window_modified = true;
+ }
+ }
+
+ window.validate();
+
+ return window_modified;
+}
+
+bool AccessWindowRectangle::update_padding_if_needed(const Window &window) const
+{
+ // Only update the padding if the tensor allows it
+ if(_info == nullptr || !_info->is_resizable())
+ {
+ return false;
+ }
+
+ ARM_COMPUTE_ERROR_ON(window.x().step() * _scale_x == 0);
+ ARM_COMPUTE_ERROR_ON(window.y().step() * _scale_y == 0);
+
+ const int min_x = window.x().start() * _scale_x + _x;
+ const int max_x = (window.x().end() - window.x().step()) * _scale_x + _x + _width;
+ const int min_y = window.y().start() * _scale_y + _y;
+ const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
+
+ const TensorShape &shape = _info->tensor_shape();
+
+ PaddingSize padding;
+ padding.left = std::max(0, -min_x);
+ padding.right = std::max<int>(0, max_x - shape[0]);
+ padding.top = shape.num_dimensions() == 1 ? 0 : std::max(0, -min_y);
+ padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, max_y - shape[1]);
+
+ // Update strides in tensor info
+ return _info->extend_padding(padding);
+}
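+
+// Worked example (illustrative): an access window with x offset -1 and width 3
+// (e.g. AccessWindowHorizontal(info, -1, 3)) over a window spanning [0, 100)
+// with step 1 on a 100-element row gives min_x = -1 and max_x = 101, so the
+// tensor is asked to extend its padding to left = 1 and right = 1.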
diff --git a/src/core/IDistribution.cpp b/src/core/IDistribution.cpp
new file mode 100644
index 0000000000..7d7186989e
--- /dev/null
+++ b/src/core/IDistribution.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IDistribution.h"
+
+#include "arm_compute/core/Error.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+void IDistribution::clear() const
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == buffer());
+ std::memset(buffer(), 0, size());
+}
diff --git a/src/core/IDistribution1D.cpp b/src/core/IDistribution1D.cpp
new file mode 100644
index 0000000000..f304289991
--- /dev/null
+++ b/src/core/IDistribution1D.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IDistribution1D.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+IDistribution1D::IDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : _num_bins(num_bins), _offset(offset), _range(range)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(0 == _num_bins, "Invalid number of bins, it should be greater than 0");
+}
+
+size_t IDistribution1D::num_bins() const
+{
+ return _num_bins;
+}
+
+int32_t IDistribution1D::offset() const
+{
+ return _offset;
+}
+
+uint32_t IDistribution1D::range() const
+{
+ return _range;
+}
+
+uint32_t IDistribution1D::window() const
+{
+ return _range / _num_bins;
+}
+
+size_t IDistribution1D::size() const
+{
+ return _num_bins * sizeof(uint32_t);
+}
+
+void IDistribution1D::set_range(uint32_t range)
+{
+ _range = range;
+}
+
+size_t IDistribution1D::dimensions() const
+{
+ return 1;
+}
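+
+// Worked example (illustrative): num_bins = 256, offset = 0 and range = 256
+// describe a plain 8-bit histogram: window() = 256 / 256 = 1 value per bin
+// and size() = 256 * sizeof(uint32_t) = 1024 bytes of backing storage.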
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
new file mode 100644
index 0000000000..6450a4fc2a
--- /dev/null
+++ b/src/core/IKernel.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/IKernel.h"
+
+using namespace arm_compute;
+
+const Window &IKernel::window() const
+{
+ return _window;
+}
+
+IKernel::IKernel()
+ : _window()
+{
+    // Create an empty window to make sure the derived classes set the window values themselves
+ _window.set(Window::DimX, Window::Dimension(0, 0, 1));
+ _window.set(Window::DimY, Window::Dimension(0, 0, 1));
+}
+
+bool IKernel::is_parallelisable() const
+{
+ return true;
+}
+
+BorderSize IKernel::border_size() const
+{
+ return BorderSize(0);
+}
+
+void IKernel::configure(const Window &window)
+{
+ _window = window;
+}
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
new file mode 100644
index 0000000000..0b29eca57b
--- /dev/null
+++ b/src/core/ITensor.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cstring>
+#include <limits>
+
+using namespace arm_compute;
+
+void ITensor::copy_from(const ITensor &src)
+{
+ if(&src == this)
+ {
+ return;
+ }
+
+ const ITensorInfo *src_info = src.info();
+ ITensorInfo *dst_info = this->info();
+
+ ARM_COMPUTE_ERROR_ON(src_info->num_dimensions() > dst_info->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
+ ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size());
+
+ for(size_t d = 0; d < src_info->num_dimensions(); d++)
+ {
+ ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d));
+ }
+
+ // Copy information about valid region
+ dst_info->set_valid_region(src_info->valid_region());
+
+ Window win_src;
+ win_src.use_tensor_dimensions(src_info, Window::DimY);
+ Window win_dst;
+ win_dst.use_tensor_dimensions(dst_info, Window::DimY);
+
+ Iterator src_it(&src, win_src);
+ Iterator dst_it(this, win_dst);
+
+ const size_t line_size = src_info->num_channels() * src_info->element_size() * src_info->dimension(0);
+
+ execute_window_loop(win_src, [&](const Coordinates & id)
+ {
+ memcpy(dst_it.ptr(), src_it.ptr(), line_size);
+ },
+ src_it, dst_it);
+}
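+
+// Usage note (illustrative): copy_from() iterates over DimY slices and copies
+// whole rows with memcpy, so a smaller tensor can be copied into a larger one
+// of the same element type and channel count (e.g. a 50x50 F32 tensor into a
+// 100x100 F32 tensor); only the source's rows are written, and the
+// destination's padding is left untouched.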
+
+void ITensor::print(std::ostream &s, IOFormatInfo io_fmt) const
+{
+ ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr);
+
+ const DataType dt = this->info()->data_type();
+ const size_t slices2D = this->info()->tensor_shape().total_size_upper(2);
+ const Strides strides = this->info()->strides_in_bytes();
+ const PaddingSize padding = this->info()->padding();
+
+ // Set precision
+ if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
+ {
+ int precision = io_fmt.precision;
+ if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
+ {
+            precision = std::numeric_limits<float>::max_digits10;
+ }
+ s.precision(precision);
+ }
+
+ // Define region to print
+ size_t print_width = 0;
+ size_t print_height = 0;
+ int start_offset = 0;
+ switch(io_fmt.print_region)
+ {
+ case IOFormatInfo::PrintRegion::NoPadding:
+ print_width = this->info()->dimension(0);
+ print_height = this->info()->dimension(1);
+ start_offset = this->info()->offset_first_element_in_bytes();
+ break;
+ case IOFormatInfo::PrintRegion::ValidRegion:
+ print_width = this->info()->valid_region().shape.x();
+ print_height = this->info()->valid_region().shape.y();
+ start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(),
+ this->info()->valid_region().anchor.y()));
+ break;
+ case IOFormatInfo::PrintRegion::Full:
+ print_width = padding.left + this->info()->dimension(0) + padding.right;
+ print_height = padding.top + this->info()->dimension(1) + padding.bottom;
+ start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0];
+ break;
+ default:
+ break;
+ }
+
+ // Set pointer to start
+ const uint8_t *ptr = this->buffer() + start_offset;
+
+ // Start printing
+ for(size_t i = 0; i < slices2D; ++i)
+ {
+ // Find max_width of elements in slice to align columns
+ int max_element_width = 0;
+ if(io_fmt.align_columns)
+ {
+ size_t offset = i * strides[2];
+ for(size_t h = 0; h < print_height; ++h)
+ {
+ max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
+ offset += strides[1];
+ }
+ }
+
+ // Print slice
+ {
+ size_t offset = i * strides[2];
+ for(size_t h = 0; h < print_height; ++h)
+ {
+ print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim);
+ offset += strides[1];
+ s << io_fmt.row_delim;
+ }
+ s << io_fmt.row_delim;
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/core/MultiImageInfo.cpp b/src/core/MultiImageInfo.cpp
new file mode 100644
index 0000000000..1e40a77c82
--- /dev/null
+++ b/src/core/MultiImageInfo.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/MultiImageInfo.h"
+
+using namespace arm_compute;
+
+MultiImageInfo::MultiImageInfo()
+ : _width(0), _height(0), _format(Format::UNKNOWN)
+{
+}
+
+void MultiImageInfo::init(unsigned int width, unsigned int height, Format format)
+{
+ _format = format;
+ _width = width;
+ _height = height;
+}
+
+Format MultiImageInfo::format() const
+{
+ return _format;
+}
+
+unsigned int MultiImageInfo::width() const
+{
+ return _width;
+}
+
+unsigned int MultiImageInfo::height() const
+{
+ return _height;
+}
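MultiImageInfo is a plain value type with no tensor dependencies; a minimal usage sketch (hypothetical snippet, assuming only the header above):

#include "arm_compute/core/MultiImageInfo.h"

using namespace arm_compute;

int main()
{
    MultiImageInfo info;               // starts as 0 x 0, Format::UNKNOWN
    info.init(640, 480, Format::NV12); // width, height, multi-planar format
    return (info.width() == 640 && info.height() == 480) ? 0 : 1;
}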
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000000..edb0a0f304
--- /dev/null
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
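+// Scalar equivalence of the U8 path below: out[i] = |in1[i] - in2[i]|, which always fits in U8.
+// The S16 paths must saturate instead, since e.g. |INT16_MIN - INT16_MAX| does not fit in int16_t.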
+void abs_diff_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t input1_val = vld1q_u8(input1.ptr());
+ const uint8x16_t input2_val = vld1q_u8(input2.ptr());
+
+ vst1q_u8(output.ptr(), vabdq_u8(input1_val, input2_val));
+ },
+ input1, input2, output);
+}
+
+inline int16x8x2_t vqabd2q_s16(const int16x8x2_t &v1, const int16x8x2_t &v2)
+{
+ const int16x8x2_t res =
+ {
+ {
+ vqabsq_s16(vqsubq_s16(v1.val[0], v2.val[0])),
+ vqabsq_s16(vqsubq_s16(v1.val[1], v2.val[1]))
+ }
+ };
+
+ return res;
+}
+
+void abs_diff_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ int16x8x2_t input1_val = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ int16x8x2_t input2_val = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqabd2q_s16(input1_val, input2_val));
+ },
+ input1, input2, output);
+}
+
+void abs_diff_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t input1_val = vld1q_u8(input1.ptr());
+ const int16x8x2_t input2_val =
+ {
+ {
+ vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
+ vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t out_val =
+ {
+ {
+ vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input1_val))), input2_val.val[0])),
+ vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input1_val))), input2_val.val[1]))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out_val.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, out_val.val[1]);
+ },
+ input1, input2, output);
+}
+
+void abs_diff_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
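+ // Absolute difference is commutative, so reuse the U8/S16 implementation with the inputs swapped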
+ abs_diff_U8_S16_S16(in2, in1, out, window);
+}
+} // namespace
+
+NEAbsoluteDifferenceKernel::NEAbsoluteDifferenceKernel()
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEAbsoluteDifferenceKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else
+ {
+ // Only U8 and S16 inputs are supported, so the remaining case is two U8 inputs
+ set_format_if_unknown(*output->info(), Format::U8);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "The output image can only be U8 if both input images are U8");
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ const DataType input1_data_type = input1->info()->data_type();
+ const DataType input2_data_type = input2->info()->data_type();
+
+ if(input1_data_type == input2_data_type)
+ {
+ if(input1_data_type == DataType::U8)
+ {
+ _func = &abs_diff_U8_U8_U8;
+ }
+ else
+ {
+ _func = &abs_diff_S16_S16_S16;
+ }
+ }
+ else
+ {
+ if(input1_data_type == DataType::U8)
+ {
+ _func = &abs_diff_U8_S16_S16;
+ }
+ else
+ {
+ _func = &abs_diff_S16_U8_S16;
+ }
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEAbsoluteDifferenceKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ _func(_input1, _input2, _output, window);
+}
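A minimal single-threaded driver for this kernel (a hypothetical sketch, not library code; assumes src0, src1 and dst are allocated U8/S16 tensors of matching shape — in practice a scheduler would split the window across threads):

#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"

using namespace arm_compute;

void run_absdiff(const ITensor *src0, const ITensor *src1, ITensor *dst)
{
    NEAbsoluteDifferenceKernel kernel;
    kernel.configure(src0, src1, dst); // selects one of the abs_diff_* functions above
    kernel.run(kernel.window());       // process the full configured window on this thread
}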
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
new file mode 100644
index 0000000000..e5b933a781
--- /dev/null
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+/* Max S16 value used for saturation purposes. */
+static const uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input)
+{
+ const float16x8x2_t out =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(input))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(input)))
+ }
+ };
+
+ return out;
+}
+
+inline uint8x16_t convert_f16x8x2_to_u8x16(const float16x8x2_t &input)
+{
+ return vcombine_u8(vmovn_u16(vcvtq_u16_f16(input.val[0])),
+ vmovn_u16(vcvtq_u16_f16(input.val[1])));
+}
+
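+// Weighted accumulation: accum' = (1 - alpha) * accum + alpha * input,
+// where scale_val = 1 - alpha and scale_val2 = alpha (set up in run() below).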
+inline float16x8x2_t vector_accumulate_weighted(const float16x8x2_t &vec0, const float16x8x2_t &vec1, float16x8_t scale_val, float16x8_t scale_val2)
+{
+ const float16x8x2_t res =
+ {
+ {
+ vfmaq_f16(vmulq_f16(vec1.val[0], scale_val), vec0.val[0], scale_val2),
+ vfmaq_f16(vmulq_f16(vec1.val[1], scale_val), vec0.val[1], scale_val2)
+ }
+ };
+
+ return res;
+}
+
+void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, float16x8_t scale_val, float16x8_t scale_val2)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+
+ const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
+ const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
+
+ const uint8x16x4_t input_buffer = vld4q_u8(input_ptr);
+ uint8x16x4_t accum_buffer = vld4q_u8(accum_ptr);
+
+ const float16x8x2_t f16_input_0 = convert_u8x16_to_f16x8x2(input_buffer.val[0]);
+ const float16x8x2_t f16_input_1 = convert_u8x16_to_f16x8x2(input_buffer.val[1]);
+ const float16x8x2_t f16_input_2 = convert_u8x16_to_f16x8x2(input_buffer.val[2]);
+ const float16x8x2_t f16_input_3 = convert_u8x16_to_f16x8x2(input_buffer.val[3]);
+
+ float16x8x2_t f16_accum_0 = convert_u8x16_to_f16x8x2(accum_buffer.val[0]);
+ float16x8x2_t f16_accum_1 = convert_u8x16_to_f16x8x2(accum_buffer.val[1]);
+ float16x8x2_t f16_accum_2 = convert_u8x16_to_f16x8x2(accum_buffer.val[2]);
+ float16x8x2_t f16_accum_3 = convert_u8x16_to_f16x8x2(accum_buffer.val[3]);
+
+ f16_accum_0 = vector_accumulate_weighted(f16_input_0, f16_accum_0, scale_val, scale_val2);
+ f16_accum_1 = vector_accumulate_weighted(f16_input_1, f16_accum_1, scale_val, scale_val2);
+ f16_accum_2 = vector_accumulate_weighted(f16_input_2, f16_accum_2, scale_val, scale_val2);
+ f16_accum_3 = vector_accumulate_weighted(f16_input_3, f16_accum_3, scale_val, scale_val2);
+
+ accum_buffer = { {
+ convert_f16x8x2_to_u8x16(f16_accum_0),
+ convert_f16x8x2_to_u8x16(f16_accum_1),
+ convert_f16x8x2_to_u8x16(f16_accum_2),
+ convert_f16x8x2_to_u8x16(f16_accum_3)
+ }
+ };
+
+ vst4q_u8(accum_ptr, accum_buffer);
+}
+} // namespace fp16
+
+void NEAccumulateWeightedFP16Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ const float16x8_t scale_val = vdupq_n_f16(1.f - _alpha);
+ const float16x8_t scale_val2 = vdupq_n_f16(_alpha);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
+ },
+ input, accum);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+namespace
+{
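+// Scalar equivalence: accum[i] = saturate_s16(accum[i] + input[i]) for each of the 16 U8 inputs per call.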
+inline void acc_v16_u8(const void *__restrict input, void *__restrict accum)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+
+ const auto in = static_cast<const uint8_t *__restrict>(input);
+ const auto out = static_cast<int16_t *__restrict>(accum);
+
+ uint8x16_t ta1 = vld1q_u8(in);
+ int16x8_t ta2 = vld1q_s16(out);
+ int16x8_t ta3 = vld1q_s16(out + 8);
+
+ ta2 = vqaddq_s16(ta2, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ta1))));
+ ta3 = vqaddq_s16(ta3, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(ta1))));
+
+ vst1q_s16(out, ta2);
+ vst1q_s16(out + 8, ta3);
+}
+
+inline float32x4x4_t convert_u8x16_to_f32x4x4(uint8x16_t input)
+{
+ const uint16x8_t u16_output_low = vmovl_u8(vget_low_u8(input));
+ const uint16x8_t u16_output_hi = vmovl_u8(vget_high_u8(input));
+
+ const float32x4x4_t res =
+ {
+ {
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_low))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_low))),
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_hi))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_hi)))
+ }
+ };
+
+ return res;
+}
+
+inline uint8x16_t convert_f32x4x4_to_u8x16(const float32x4x4_t &input)
+{
+ return vcombine_u8(vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[0])),
+ vmovn_u32(vcvtq_u32_f32(input.val[1])))),
+ vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[2])),
+ vmovn_u32(vcvtq_u32_f32(input.val[3])))));
+}
+
+inline float32x4x4_t vector_accumulate_weighted(const float32x4x4_t &vector_input, float32x4x4_t vector_output, float32x4_t scale_val, float32x4_t scale_val2)
+{
+ vector_output.val[0] = vmulq_f32(vector_output.val[0], scale_val);
+ vector_output.val[1] = vmulq_f32(vector_output.val[1], scale_val);
+ vector_output.val[2] = vmulq_f32(vector_output.val[2], scale_val);
+ vector_output.val[3] = vmulq_f32(vector_output.val[3], scale_val);
+
+ vector_output.val[0] = vmlaq_f32(vector_output.val[0], vector_input.val[0], scale_val2);
+ vector_output.val[1] = vmlaq_f32(vector_output.val[1], vector_input.val[1], scale_val2);
+ vector_output.val[2] = vmlaq_f32(vector_output.val[2], vector_input.val[2], scale_val2);
+ vector_output.val[3] = vmlaq_f32(vector_output.val[3], vector_input.val[3], scale_val2);
+
+ return vector_output;
+}
+
+inline void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, const float32x4_t scale_val, const float32x4_t scale_val2)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+
+ const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
+ const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
+
+ const uint8x16_t input_buffer = vld1q_u8(input_ptr);
+ const uint8x16_t accum_buffer = vld1q_u8(accum_ptr);
+
+ const float32x4x4_t f32_input_0 = convert_u8x16_to_f32x4x4(input_buffer);
+ const float32x4x4_t f32_output_0 = convert_u8x16_to_f32x4x4(accum_buffer);
+
+ const float32x4x4_t f32_res_0 = vector_accumulate_weighted(f32_input_0, f32_output_0, scale_val, scale_val2);
+
+ vst1q_u8(accum_ptr, convert_f32x4x4_to_u8x16(f32_res_0));
+}
+
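+// Scalar equivalence: accum[i] = min(INT16_MAX, saturate_u16(accum[i] + ((input[i] * input[i]) >> shift)))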
+void acc_sq_v16_u8(const void *__restrict input, uint32_t shift, void *__restrict accum)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == accum);
+ ARM_COMPUTE_ERROR_ON(shift > 15);
+
+ const auto input_buffer = static_cast<const uint8_t *__restrict>(input);
+ const auto accum_buffer = static_cast<int16_t *__restrict>(accum);
+
+ const uint8x16_t ta1 = vld1q_u8(input_buffer);
+ uint16x8_t ta2 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer));
+ uint16x8_t ta3 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer + 8));
+
+ const int16x8_t vector_shift = vdupq_n_s16(-static_cast<int16_t>(shift));
+
+ uint16x8_t linput = vmovl_u8(vget_low_u8(ta1));
+ uint16x8_t hinput = vmovl_u8(vget_high_u8(ta1));
+
+ linput = vmulq_u16(linput, linput);
+ hinput = vmulq_u16(hinput, hinput);
+
+ linput = vqshlq_u16(linput, vector_shift);
+ hinput = vqshlq_u16(hinput, vector_shift);
+
+ ta2 = vqaddq_u16(ta2, linput);
+ ta3 = vqaddq_u16(ta3, hinput);
+
+ vst1q_s16(accum_buffer, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta2)));
+ vst1q_s16(accum_buffer + 8, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta3)));
+}
+} // namespace
+
+void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::S16);
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void NEAccumulateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ acc_v16_u8(input.ptr(), accum.ptr());
+ },
+ input, accum);
+}
+
+NEAccumulateWeightedKernel::NEAccumulateWeightedKernel()
+ : _alpha(0.0f)
+{
+}
+
+void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
+
+ _alpha = alpha;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void NEAccumulateWeightedKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.f - _alpha);
+ const float32x4_t scale_val2 = vdupq_n_f32(_alpha);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
+ },
+ input, accum);
+}
+
+NEAccumulateSquaredKernel::NEAccumulateSquaredKernel()
+ : _shift(0)
+{
+}
+
+void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
+
+ set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*accum->info(), Format::S16);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(shift > 15);
+
+ _shift = shift;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+void NEAccumulateSquaredKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator accum(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ acc_sq_v16_u8(input.ptr(), _shift, accum.ptr());
+ },
+ input, accum);
+}
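The accumulate kernels above all use the simple-kernel configure path; a minimal single-threaded usage sketch for the weighted variant (hypothetical driver code; assumes frame and accum are allocated U8 tensors of identical shape):

#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"

using namespace arm_compute;

void accumulate_weighted(const ITensor *frame, ITensor *accum, float alpha)
{
    NEAccumulateWeightedKernel kernel;
    kernel.configure(frame, alpha, accum); // alpha must lie in [0, 1]
    kernel.run(kernel.window());           // accum = (1 - alpha) * accum + alpha * frame, per pixel
}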
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
new file mode 100644
index 0000000000..a878078007
--- /dev/null
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <array>
+#include <cmath>
+#include <map>
+
+using namespace arm_compute;
+
+NEActivationLayerKernel::NEActivationLayerKernel()
+ : _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
+{
+}
+
+void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ // Activation functions : FP32
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, float> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, float> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, float> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, float> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, float> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, float> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, float> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
+ };
+
+ // Activation functions : QS8
+ static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
+ {
+ { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint8_t> },
+ { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint8_t> },
+ { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
+ { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
+ { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
+ { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
+ { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
+ { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
+ { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
+ };
+
+ _input = input;
+ _output = output;
+ _act_info = activation_info;
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = act_map_f32[activation_info.activation()];
+ break;
+ case DataType::QS8:
+ _func = act_map_qs8[activation_info.activation()];
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration);
+}
+
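+// The activation<F, T> member template below has two SFINAE-selected overloads:
+// the float overload implements the F32 paths and the int8_t overload the QS8
+// fixed-point paths; configure() above stores the matching instantiation in _func.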
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+ static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+ const float32x4_t a = vdupq_n_f32(_act_info.a());
+ const float32x4_t b = vdupq_n_f32(_act_info.b());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ const float32x4x4_t in = vld4q_f32(input_ptr);
+ float32x4x4_t tmp = { {} };
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp =
+ {
+ {
+ vabsq_f32(in.val[0]),
+ vabsq_f32(in.val[1]),
+ vabsq_f32(in.val[2]),
+ vabsq_f32(in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp =
+ {
+ {
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])),
+ vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::LINEAR:
+ tmp =
+ {
+ {
+ vmlaq_f32(b, a, in.val[0]),
+ vmlaq_f32(b, a, in.val[1]),
+ vmlaq_f32(b, a, in.val[2]),
+ vmlaq_f32(b, a, in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp =
+ {
+ {
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))),
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))),
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))),
+ vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))),
+ }
+ };
+ break;
+ case ActivationFunction::RELU:
+ tmp =
+ {
+ {
+ vmaxq_f32(CONST_0, in.val[0]),
+ vmaxq_f32(CONST_0, in.val[1]),
+ vmaxq_f32(CONST_0, in.val[2]),
+ vmaxq_f32(CONST_0, in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp =
+ {
+ {
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))),
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))),
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))),
+ vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))),
+ }
+ };
+ break;
+ case ActivationFunction::SQRT:
+ tmp =
+ {
+ {
+ vinvq_f32(vinvsqrtq_f32(in.val[0])),
+ vinvq_f32(vinvsqrtq_f32(in.val[1])),
+ vinvq_f32(vinvsqrtq_f32(in.val[2])),
+ vinvq_f32(vinvsqrtq_f32(in.val[3])),
+ }
+ };
+ break;
+ case ActivationFunction::SQUARE:
+ tmp =
+ {
+ {
+ vmulq_f32(in.val[0], in.val[0]),
+ vmulq_f32(in.val[1], in.val[1]),
+ vmulq_f32(in.val[2], in.val[2]),
+ vmulq_f32(in.val[3], in.val[3]),
+ }
+ };
+ break;
+ case ActivationFunction::TANH:
+ tmp =
+ {
+ {
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))),
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))),
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))),
+ vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))),
+ }
+ };
+ break;
+ default:
+ break;
+ }
+
+ vst4q_f32(output_ptr, tmp);
+ },
+ input, output);
+}
+
+template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+ int fixed_point_position = _input->info()->fixed_point_position();
+
+ static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
+ const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
+ const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
+ const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+ const qint8x16_t in = vld1q_qs8(input_ptr);
+ qint8x16_t tmp = {};
+
+ switch(F)
+ {
+ case ActivationFunction::ABS:
+ tmp = vqabsq_qs8(in);
+ break;
+ case ActivationFunction::BOUNDED_RELU:
+ tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
+ break;
+ case ActivationFunction::LINEAR:
+ tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
+ break;
+ case ActivationFunction::LOGISTIC:
+ tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
+ break;
+ case ActivationFunction::RELU:
+ tmp = vmaxq_qs8(CONST_0, in);
+ break;
+ case ActivationFunction::SOFT_RELU:
+ tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
+ break;
+ case ActivationFunction::SQRT:
+ tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
+ break;
+ case ActivationFunction::SQUARE:
+ tmp = vqmulq_qs8(in, in, fixed_point_position);
+ break;
+ case ActivationFunction::TANH:
+ tmp = vtanhq_qs8(in, fixed_point_position);
+ break;
+ default:
+ break;
+ }
+
+ vst1q_qs8(output_ptr, tmp);
+ },
+ input, output);
+}
+
+void NEActivationLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
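For validating the vector paths above, the F32 activations reduce to the following scalar forms (a reference sketch only; a and b stand in for ActivationLayerInfo::a() and ::b()):

#include <algorithm>
#include <cmath>

inline float abs_ref(float x)                      { return std::fabs(x); }
inline float bounded_relu_ref(float x, float a)    { return std::min(a, std::max(0.f, x)); }
inline float linear_ref(float x, float a, float b) { return a * x + b; }
inline float logistic_ref(float x)                 { return 1.f / (1.f + std::exp(-x)); }
inline float relu_ref(float x)                     { return std::max(0.f, x); }
inline float soft_relu_ref(float x)                { return std::log(1.f + std::exp(x)); }
inline float sqrt_ref(float x)                     { return std::sqrt(x); }
inline float square_ref(float x)                   { return x * x; }
inline float tanh_ref(float x, float a, float b)   { return a * std::tanh(b * x); }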
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
new file mode 100644
index 0000000000..a4fdad8a2a
--- /dev/null
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
+ },
+ input1, input2, output);
+}
+
+inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
+{
+ const int16x8x2_t res =
+ {
+ {
+ vaddq_s16(a.val[0], b.val[0]),
+ vaddq_s16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+
+inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
+{
+ const float32x4x4_t res =
+ {
+ {
+ vaddq_f32(a.val[0], b.val[0]),
+ vaddq_f32(a.val[1], b.val[1]),
+ vaddq_f32(a.val[2], b.val[2]),
+ vaddq_f32(a.val[3], b.val[3])
+ }
+ };
+
+ return res;
+}
+
+inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
+{
+ const int16x8x2_t res =
+ {
+ {
+ vqaddq_s16(a.val[0], b.val[0]),
+ vqaddq_s16(a.val[1], b.val[1])
+ }
+ };
+
+ return res;
+}
+
+void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
+ const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
+
+ vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
+ },
+ input1, input2, output);
+}
+
+void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a =
+ {
+ {
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
+ }
+ };
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t a =
+ {
+ {
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
+ vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
+ }
+ };
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
+ },
+ input1, input2, output);
+}
+
+inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
+{
+ // Simply swap the two input buffers:
+ add_wrap_S16_U8_S16(input2, input1, output, window);
+}
+
+inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)
+{
+ // Simply swap the two input buffers:
+ add_saturate_S16_U8_S16(input2, input1, output, window);
+}
+
+void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t a = vld1q_u8(input1.ptr());
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ const int16x8x2_t a_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
+ }
+ };
+
+ const int16x8x2_t b_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
+ },
+ input1, input2, output);
+}
+
+void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t a = vld1q_u8(input1.ptr());
+ const uint8x16_t b = vld1q_u8(input2.ptr());
+
+ const int16x8x2_t a_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
+ }
+ };
+
+ const int16x8x2_t b_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
+ },
+ input1, input2, output);
+}
+} // namespace
+
+NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+
+ static std::map<std::string, AddFunction *> map_function =
+ {
+ { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
+ { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
+ { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
+ { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
+ { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
+ { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
+ { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
+ { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
+ { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
+ { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
+ { "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
+ { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
+ };
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ std::string function_to_call("add_");
+ function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
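+ // e.g. ConvertPolicy::SATURATE with two U8 inputs and an S16 output selects "add_saturate_U8_U8_S16"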
+
+ auto it = map_function.find(function_to_call);
+
+ if(it != map_function.end())
+ {
+ _func = it->second;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("You called arithmetic addition with the wrong tensor data type");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEArithmeticAdditionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input1, _input2, _output, window);
+}
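The WRAP/SATURATE policy distinction above is easiest to see in scalar form (a sketch; for example 200 + 100 gives 44 under WRAP and 255 under SATURATE):

#include <algorithm>
#include <cstdint>

inline uint8_t add_wrap_u8(uint8_t a, uint8_t b)
{
    return static_cast<uint8_t>(a + b); // modulo-256 arithmetic, like vaddq_u8
}

inline uint8_t add_saturate_u8(uint8_t a, uint8_t b)
{
    return static_cast<uint8_t>(std::min(255, int{a} + int{b})); // clamped, like vqaddq_u8
}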
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
new file mode 100644
index 0000000000..d3e62b069e
--- /dev/null
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstdint>
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t ta1 = vld1q_u8(input1.ptr());
+ const uint8x16_t ta2 = vld1q_u8(input2.ptr());
+
+ vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2));
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t ta1 = vld1q_u8(input1.ptr());
+ const uint8x16_t ta2 = vld1q_u8(input2.ptr());
+
+ vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2));
+ },
+ input1, input2, output);
+}
+
+void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ const int16x8x2_t ta3 =
+ {
+ {
+ vsubq_s16(ta1.val[0], ta2.val[0]),
+ vsubq_s16(ta1.val[1], ta2.val[1])
+ }
+ };
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+
+ const int16x8x2_t ta3 =
+ {
+ {
+ vqsubq_s16(ta1.val[0], ta2.val[0]),
+ vqsubq_s16(ta1.val[1], ta2.val[1])
+ }
+ };
+
+ vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
+ },
+ input1, input2, output);
+}
+
+void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
+ const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
+
+ const float32x4x4_t ta3 =
+ {
+ {
+ vsubq_f32(ta1.val[0], ta2.val[0]),
+ vsubq_f32(ta1.val[1], ta2.val[1]),
+ vsubq_f32(ta1.val[2], ta2.val[2]),
+ vsubq_f32(ta1.val[3], ta2.val[3]),
+ }
+ };
+
+ vst4q_f32(reinterpret_cast<float *>(output.ptr()), ta3);
+ },
+ input1, input2, output);
+}
+
+void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
+
+ a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
+
+ a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
+
+ a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
+ a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
+ int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+ int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
+
+ a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
+ a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t av_0 = vld1q_u8(input1.ptr());
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+
+ const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+
+void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+ Iterator input1(in1, window);
+ Iterator input2(in2, window);
+ Iterator output(out, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t av_0 = vld1q_u8(input1.ptr());
+ const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
+
+ const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
+ const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+ },
+ input1, input2, output);
+}
+} // namespace
+
+NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
+ : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+ {
+ set_format_if_unknown(*output->info(), Format::S16);
+ }
+ else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+ {
+ set_format_if_unknown(*output->info(), Format::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+
+ static std::map<std::string, SubFunction *> map_function =
+ {
+ { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
+ { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
+ { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
+ { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
+ { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
+ { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
+ { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
+ { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
+ { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
+ { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
+ { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
+ { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
+ };
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ std::string function_to_call("sub_");
+ function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
+ function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
+ function_to_call += string_from_data_type(output->info()->data_type());
+
+ auto it = map_function.find(function_to_call);
+
+ if(it != map_function.end())
+ {
+ _func = it->second;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("You called subtract with the wrong image formats");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
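+
+// Scalar sketch (illustration only) of the two ConvertPolicy behaviours the
+// dispatch key above selects between; for F32 both policies map to the same
+// function, since saturation is meaningless for floats.
+static inline int16_t sub_wrap_scalar(int16_t a, int16_t b)
+{
+    // WRAP: plain modular arithmetic, as vsubq_s16
+    return static_cast<int16_t>(a - b);
+}
+static inline int16_t sub_saturate_scalar(int16_t a, int16_t b)
+{
+    // SATURATE: clamp to [-32768, 32767], as vqsubq_s16
+    const int32_t r = static_cast<int32_t>(a) - static_cast<int32_t>(b);
+    return static_cast<int16_t>(r > 32767 ? 32767 : (r < -32768 ? -32768 : r));
+}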
+
+void NEArithmeticSubtractionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input1, _input2, _output, window);
+}
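+
+// Hypothetical usage sketch (tensor names are assumptions): configure the
+// kernel once, then execute it over its maximum window.
+//
+//   NEArithmeticSubtractionKernel kernel;
+//   kernel.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);
+//   kernel.run(kernel.window());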
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..9a216aecde
--- /dev/null
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
+{
+}
+
+void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+    // Track the feature map currently being iterated so that the denominator
+    // and the NEON parameter vectors are computed only once per feature map.
+ int slice = -1;
+
+ int fixed_point_position = in->info()->fixed_point_position();
+ const auto input_mean = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const qint8_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ qint8x16_t mean_vec = vdupq_n_qs8(0);
+ qint8x16_t var_vec = vdupq_n_qs8(0);
+ qint8x16_t gamma_vec = vdupq_n_qs8(0);
+ qint8x16_t beta_vec = vdupq_n_qs8(0);
+ qint8x16_t denominator = vdupq_n_qs8(0);
+ const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+            // Construct vectors
+ mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
+ var_vec = vdupq_n_qs8(*(input_var + id.z()));
+ gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
+ const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
+ },
+ input, output);
+}
+
+void batch_normalization_fp32(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+{
+ Iterator input(in, window);
+ Iterator output(out, window);
+
+    // Track the feature map currently being iterated so that the denominator
+    // and the NEON parameter vectors are computed only once per feature map.
+ int slice = -1;
+
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma = reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0)));
+ const auto input_beta = reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0)));
+
+ float32x4_t mean_vec = vdupq_n_f32(0.0);
+ float32x4_t var_vec = vdupq_n_f32(0.0);
+ float32x4_t gamma_vec = vdupq_n_f32(0.0);
+ float32x4_t beta_vec = vdupq_n_f32(0.0);
+ float32x4_t denominator = vdupq_n_f32(0.0);
+ const float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(slice != id.z())
+ {
+            // Construct vectors
+ mean_vec = vdupq_n_f32(*(input_mean + id.z()));
+ var_vec = vdupq_n_f32(*(input_var + id.z()));
+ gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
+ beta_vec = vdupq_n_f32(*(input_beta + id.z()));
+
+ // Calculate denominator
+ denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
+ slice = id.z();
+ }
+
+ // Calculate x bar and store results
+ const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
+ const float32x4_t x_bar = vmulq_f32(numerator, denominator);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec));
+ },
+ input, output);
+}
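+
+// Scalar reference (illustration only) of what both vectorised paths above
+// compute per element, using the parameters of the current feature map z:
+//
+//   x_bar = (x - mean[z]) / sqrt(var[z] + epsilon)
+//   out   = gamma[z] * x_bar + beta[z]
+#include <cmath>
+static inline float batch_normalization_scalar(float x, float mean, float var, float beta, float gamma, float epsilon)
+{
+    const float x_bar = (x - mean) / std::sqrt(var + epsilon);
+    return gamma * x_bar + beta; // matches vmlaq_f32(beta_vec, x_bar, gamma_vec)
+}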
+
+void NEBatchNormalizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _var = var;
+ _gamma = gamma;
+ _beta = beta;
+ _epsilon = epsilon;
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &batch_normalization_q8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ _func = &batch_normalization_fp32;
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEBatchNormalizationLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window);
+}
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
new file mode 100644
index 0000000000..e8e448e455
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_and_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+{
+ const uint8x16_t val1 = vld1q_u8(input1);
+ const uint8x16_t val2 = vld1q_u8(input2);
+
+ vst1q_u8(output, vandq_u8(val1, val2));
+}
+} // namespace
+
+NEBitwiseAndKernel::NEBitwiseAndKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseAndKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_and_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
+ },
+ input1, input2, output);
+}
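+
+// Scalar equivalent (illustration only) of one 16-byte step of
+// bitwise_and_U8_U8_U8 above; vandq_u8 applies the same AND to all 16 lanes
+// in a single instruction. The other bitwise kernels follow the same shape:
+//
+//   for(int i = 0; i < 16; ++i)
+//   {
+//       output[i] = input1[i] & input2[i];
+//   }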
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
new file mode 100644
index 0000000000..bf75592677
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restrict output)
+{
+ const uint8x16_t val0 = vld1q_u8(input);
+
+ vst1q_u8(output, vmvnq_u8(val0));
+}
+} // namespace
+
+NEBitwiseNotKernel::NEBitwiseNotKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseNotKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_not_U8_U8(input.ptr(), output.ptr());
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
new file mode 100644
index 0000000000..f184be2f26
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+{
+ const uint8x16_t val1 = vld1q_u8(input1);
+ const uint8x16_t val2 = vld1q_u8(input2);
+
+ vst1q_u8(output, vorrq_u8(val1, val2));
+}
+} // namespace
+
+NEBitwiseOrKernel::NEBitwiseOrKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseOrKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
+ },
+ input1, input2, output);
+}
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
new file mode 100644
index 0000000000..c4fb4c0d03
--- /dev/null
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+{
+ const uint8x16_t val1 = vld1q_u8(input1);
+ const uint8x16_t val2 = vld1q_u8(input2);
+
+ vst1q_u8(output, veorq_u8(val1, val2));
+}
+} // namespace
+
+NEBitwiseXorKernel::NEBitwiseXorKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+ set_format_if_unknown(*output->info(), Format::U8);
+ set_format_if_unknown(*input1->info(), Format::U8);
+ set_format_if_unknown(*input2->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access);
+
+ const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEBitwiseXorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
+ },
+ input1, input2, output);
+}
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
new file mode 100644
index 0000000000..d7e6d73cd7
--- /dev/null
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void NEBox3x3FP16Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ const float16x8_t oneovernine = vdupq_n_f16(1.0f / 9.0f);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const float16x8x2_t top_f16 =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+
+ const float16x8x2_t mid_f16 =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+
+ const float16x8x2_t bot_f16 =
+ {
+ {
+ vcvtq_f16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vcvtq_f16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //top left
+ float16x8_t out = top_f16.val[0];
+ //top mid
+ out = vaddq_f16(out, vextq_f16(top_f16.val[0], top_f16.val[1], 1));
+ //top right
+ out = vaddq_f16(out, vextq_f16(top_f16.val[0], top_f16.val[1], 2));
+ //mid left
+ out = vaddq_f16(out, mid_f16.val[0]);
+ //mid mid
+ out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 1));
+ //mid right
+ out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 2));
+ //bot left
+ out = vaddq_f16(out, bot_f16.val[0]);
+ //bot mid
+ out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 1));
+ //bot right
+ out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 2));
+
+ out = vmulq_f16(out, oneovernine);
+
+ vst1_u8(output.ptr(), vqmovun_s16(vcvtq_s16_f16(out)));
+ },
+ input, output);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+BorderSize NEBox3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEBox3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*input->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+ constexpr int rect_offset_xy = -1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration), output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEBox3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ const float32x4_t oneovernine = vdupq_n_f32(1.0f / 9.0f);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //top left
+ int16x8_t out = top_s16.val[0];
+ //top mid
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1));
+ //top right
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out = vaddq_s16(out, mid_s16.val[0]);
+ //mid mid
+ out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1));
+ //mid right
+ out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2));
+ //bot left
+ out = vaddq_s16(out, bot_s16.val[0]);
+ //bot mid
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1));
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ float32x4_t outfloathigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out)));
+ float32x4_t outfloatlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out)));
+
+ outfloathigh = vmulq_f32(outfloathigh, oneovernine);
+ outfloatlow = vmulq_f32(outfloatlow, oneovernine);
+
+ out = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(outfloatlow)),
+ vqmovn_s32(vcvtq_s32_f32(outfloathigh)));
+
+ vst1_u8(output.ptr(), vqmovun_s16(out));
+ },
+ input, output);
+}
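+
+// Scalar reference (illustration only) for a single output pixel of the 3x3
+// box filter above: the truncating mean of the 3x3 neighbourhood, i.e. the
+// same 1/9 factor held in oneovernine.
+static inline uint8_t box3x3_scalar(const uint8_t *in, int stride)
+{
+    int sum = 0;
+    for(int dy = -1; dy <= 1; ++dy)
+    {
+        for(int dx = -1; dx <= 1; ++dx)
+        {
+            sum += in[dy * stride + dx];
+        }
+    }
+    return static_cast<uint8_t>(sum / 9);
+}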
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
new file mode 100644
index 0000000000..85a2cd5855
--- /dev/null
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -0,0 +1,1856 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+constexpr int NO_EDGE = 0;
+constexpr int EDGE = 255;
+constexpr int MAYBE = 127;
+} // namespace
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
+    // Constants used for evaluating score1 and score3
+ static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
+ static const float32x4_t zero = vdupq_n_f32(0.0f);
+ static const float32x4_t one = vdupq_n_f32(1.0f);
+ static const float32x4_t two = vdupq_n_f32(2.0f);
+ static const float32x4_t three = vdupq_n_f32(3.0f);
+
+ // Score0: (1, 0)
+ const float32x4x2_t score0 =
+ {
+ vabsq_f32(gx.val[0]),
+ vabsq_f32(gx.val[1])
+ };
+
+ // Score2: ( 0, 1 )
+ const float32x4x2_t score2 =
+ {
+ vabsq_f32(gy.val[0]),
+ vabsq_f32(gy.val[1])
+ };
+
+ // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
+ float32x4x2_t score1 =
+ {
+ vmulq_f32(gy.val[0], const45),
+ vmulq_f32(gy.val[1], const45)
+ };
+
+ float32x4x2_t score3 = score1;
+
+ score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
+ score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
+ score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
+ score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
+
+ score1.val[0] = vabsq_f32(score1.val[0]);
+ score1.val[1] = vabsq_f32(score1.val[1]);
+ score3.val[0] = vabsq_f32(score3.val[0]);
+ score3.val[1] = vabsq_f32(score3.val[1]);
+
+ float32x4x2_t phase =
+ {
+ zero,
+ zero
+ };
+
+ float32x4x2_t old_score = score0;
+
+ // score1 > old_score?
+ uint32x4x2_t mask =
+ {
+ vcgtq_f32(score1.val[0], old_score.val[0]),
+ vcgtq_f32(score1.val[1], old_score.val[1])
+ };
+
+ phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
+
+ // score2 > old_score?
+ mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
+
+ // score3 > old_score?
+ mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
+
+ // Convert from float32x4_t to uint8x8_t
+ return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
+ vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
+}
+
+inline uint8x8_t phase_quantization(float16x8_t gx, float16x8_t gy)
+{
+    // Constants used for evaluating score1 and score3
+ static const float16x8_t const45 = vdupq_n_f16(0.70710678118655f);
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+ static const float16x8_t one = vdupq_n_f16(1.0f);
+ static const float16x8_t two = vdupq_n_f16(2.0f);
+ static const float16x8_t three = vdupq_n_f16(3.0f);
+
+ // Score0: (1, 0)
+ const float16x8_t score0 = vabsq_f16(gx);
+
+ // Score2: ( 0, 1 )
+ const float16x8_t score2 = vabsq_f16(gy);
+
+ // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
+ float16x8_t score1 = vmulq_f16(gy, const45);
+ float16x8_t score3 = score1;
+
+ score1 = vfmaq_f16(score1, gx, const45);
+ score3 = vfmsq_f16(score3, gx, const45);
+
+ score1 = vabsq_f16(score1);
+ score3 = vabsq_f16(score3);
+
+ float16x8_t phase = zero;
+ float16x8_t old_score = score0;
+
+ // score1 > old_score?
+ uint16x8_t mask = vcgtq_f16(score1, old_score);
+
+ phase = vbslq_f16(mask, one, phase);
+ old_score = vbslq_f16(mask, score1, old_score);
+
+ // score2 > old_score?
+ mask = vcgtq_f16(score2, old_score);
+
+ phase = vbslq_f16(mask, two, phase);
+ old_score = vbslq_f16(mask, score2, old_score);
+
+ // score3 > old_score?
+ mask = vcgtq_f16(score3, old_score);
+
+ phase = vbslq_f16(mask, three, phase);
+
+ // Convert from float16x8_t to uint8x8_t
+ return vmovn_u16(vcvtq_u16_f16(phase));
+}
+
+/** Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ return phase_quantization(vcvtq_f16_s16(gx), vcvtq_f16_s16(gy));
+}
+
+/** Computes the gradient phase if gradient_size = 7. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Convert to float
+ const float32x4x2_t gx_f32 =
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ };
+
+ const float32x4x2_t gy_f32 =
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ };
+
+ return phase_quantization(gx_f32, gy_f32);
+}
+
+/** Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
+ vreinterpretq_u16_s16(vabsq_s16(gy)));
+}
+
+/** Computes the magnitude using the L1-norm type if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ const uint32x4x2_t gx_abs =
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
+ };
+
+ const uint32x4x2_t gy_abs =
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
+ };
+
+ const uint32x4x2_t out =
+ {
+ vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
+ vaddq_u32(gx_abs.val[1], gy_abs.val[1])
+ };
+
+ return out;
+}
+
+inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
+ // x^2 ...
+ float32x4x2_t mag =
+ {
+ vmulq_f32(gx.val[0], gx.val[0]),
+ vmulq_f32(gx.val[1], gx.val[1])
+ };
+
+ // ... + y^2
+ mag.val[0] = vmlaq_f32(mag.val[0], gy.val[0], gy.val[0]);
+ mag.val[1] = vmlaq_f32(mag.val[1], gy.val[1], gy.val[1]);
+
+ // sqrt(...)
+ mag.val[0] = vmulq_f32(vrsqrteq_f32(mag.val[0]), mag.val[0]);
+ mag.val[1] = vmulq_f32(vrsqrteq_f32(mag.val[1]), mag.val[1]);
+
+ return mag;
+}
+
+inline float16x8_t mag_l2(float16x8_t gx, float16x8_t gy)
+{
+ // x^2 ...
+ float16x8_t mag = vmulq_f16(gx, gx);
+
+ // ... + y^2
+ mag = vfmaq_f16(mag, gy, gy);
+
+ // sqrt(...)
+ mag = vmulq_f16(vrsqrteq_f16(mag), mag);
+
+ return mag;
+}
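+
+// Note: vrsqrteq_f16 / vrsqrteq_f32 return only an initial estimate of
+// 1/sqrt(x), so mag = x * rsqrte(x) above is an approximate square root. A
+// sketch (illustration only) of one Newton-Raphson refinement step, should
+// more precision be needed:
+//
+//   float32x4_t e = vrsqrteq_f32(x);
+//   e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e)); // refine 1/sqrt(x)
+//   const float32x4_t sqrt_x = vmulq_f32(x, e);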
+
+/** Computes the magnitude using L2-norm if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ /* Compute magnitude using L2 normalization */
+ const float16x8_t gx2 = vcvtq_f16_s16(gx);
+ const float16x8_t gy2 = vcvtq_f16_s16(gy);
+ const float16x8_t mag = mag_l2(gx2, gy2);
+
+ /* Store magnitude - Convert to uint16x8 */
+ return vcvtq_u16_f16(mag);
+}
+
+/** Computes the magnitude using L2-norm if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Compute magnitude using L2 normalization
+ float32x4x2_t gx2 =
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ };
+
+ float32x4x2_t gy2 =
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ };
+
+ const float32x4x2_t mag = mag_l2(gx2, gy2);
+ const uint32x4x2_t mag32 =
+ {
+ vcvtq_u32_f32(mag.val[0]),
+ vcvtq_u32_f32(mag.val[1])
+ };
+
+ return mag32;
+}
+
+/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
+ const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
+ const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
+ const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
+ const int16x8x4_t gx =
+ {
+ vld1q_s16(in1),
+ vld1q_s16(in1 + 8),
+ vld1q_s16(in1 + 16),
+ vld1q_s16(in1 + 24)
+ };
+
+ const int16x8x4_t gy =
+ {
+ vld1q_s16(in2),
+ vld1q_s16(in2 + 8),
+ vld1q_s16(in2 + 16),
+ vld1q_s16(in2 + 24)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
+ vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
+ vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
+ vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
+
+    // Compute and store magnitude using L1 normalization
+ vst1q_u16(out1 + 0, mag_l1_S16_S16(gx.val[0], gy.val[0]));
+ vst1q_u16(out1 + 8, mag_l1_S16_S16(gx.val[1], gy.val[1]));
+ vst1q_u16(out1 + 16, mag_l1_S16_S16(gx.val[2], gy.val[2]));
+ vst1q_u16(out1 + 24, mag_l1_S16_S16(gx.val[3], gy.val[3]));
+}
+
+/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ const auto in1 = static_cast<const int16_t *__restrict>(in1_ptr);
+ const auto in2 = static_cast<const int16_t *__restrict>(in2_ptr);
+ const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
+ const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
+ const int16x8x4_t gx =
+ {
+ vld1q_s16(in1),
+ vld1q_s16(in1 + 8),
+ vld1q_s16(in1 + 16),
+ vld1q_s16(in1 + 24)
+ };
+
+ const int16x8x4_t gy =
+ {
+ vld1q_s16(in2),
+ vld1q_s16(in2 + 8),
+ vld1q_s16(in2 + 16),
+ vld1q_s16(in2 + 24)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
+ vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
+ vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
+ vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
+
+ // Compute and store magnitude using L2 normalization
+ vst1q_u16(out1 + 0, mag_l2_S16_S16(gx.val[0], gy.val[0]));
+ vst1q_u16(out1 + 8, mag_l2_S16_S16(gx.val[1], gy.val[1]));
+ vst1q_u16(out1 + 16, mag_l2_S16_S16(gx.val[2], gy.val[2]));
+ vst1q_u16(out1 + 24, mag_l2_S16_S16(gx.val[3], gy.val[3]));
+}
+
+/** Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
+ auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
+ auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
+ auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
+ // Process low and high part
+ for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ vld1q_s32(in1 + 0),
+ vld1q_s32(in1 + 4)
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ vld1q_s32(in1 + 8),
+ vld1q_s32(in1 + 12)
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ vld1q_s32(in2 + 0),
+ vld1q_s32(in2 + 4)
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ vld1q_s32(in2 + 8),
+ vld1q_s32(in2 + 12)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L1 normalization
+ const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(out1 + 0, mag0.val[0]);
+ vst1q_u32(out1 + 4, mag0.val[1]);
+ vst1q_u32(out1 + 8, mag1.val[0]);
+ vst1q_u32(out1 + 12, mag1.val[1]);
+ }
+}
+
+/** Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
+ *
+ * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
+{
+ auto in1 = static_cast<const int32_t *__restrict>(in1_ptr);
+ auto in2 = static_cast<const int32_t *__restrict>(in2_ptr);
+ auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
+ auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
+
+ // Process low and high part
+ for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ vld1q_s32(in1 + 0),
+ vld1q_s32(in1 + 4)
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ vld1q_s32(in1 + 8),
+ vld1q_s32(in1 + 12)
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ vld1q_s32(in2 + 0),
+ vld1q_s32(in2 + 4)
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ vld1q_s32(in2 + 8),
+ vld1q_s32(in2 + 12)
+ };
+
+ // Compute and store phase
+ vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L2 normalization
+ const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(out1 + 0, mag0.val[0]);
+ vst1q_u32(out1 + 4, mag0.val[1]);
+ vst1q_u32(out1 + 8, mag1.val[0]);
+ vst1q_u32(out1 + 12, mag1.val[1]);
+ }
+}
+
+inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
+{
+    // Phase for 4 pixels
+ const uint32x4_t pc32 = vmovl_u16(pc);
+
+    // Get magnitude for 4 pixels
+ uint32x4_t mc = vld1q_u32(in);
+
+ // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ // 0 degree
+ const uint32x4_t mk0_0 = vld1q_u32(in - 1);
+ const uint32x4_t mk0_1 = vld1q_u32(in + 1);
+ uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1));
+
+ // 45 degree
+ const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1);
+ const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1);
+ uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1));
+
+ // 90 degree
+ const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag);
+ const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag);
+ uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1));
+
+ // 135 degree
+ const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1);
+ const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1);
+ uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1));
+
+ // Merge masks
+ mask0 = vorrq_u32(mask0, mask1);
+ mask2 = vorrq_u32(mask2, mask3);
+ mask0 = vorrq_u32(mask0, mask2);
+
+ mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
+
+ // mc > upper_thr
+ mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
+
+ // mc <= lower_thr
+ mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
+
+ // mc <= upper_thr && mc > lower_thr
+ mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
+ mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
+
+ mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
+ mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
+ mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
+
+ return vmovn_u32(mc);
+}
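+
+// Scalar sketch (illustration only) of the per-pixel logic above: keep the
+// magnitude only where it is a local maximum along its quantized gradient
+// direction, then classify it against the two hysteresis thresholds:
+//
+//   uint32_t m = is_local_max ? mag : 0;
+//   out = (m > upper_thr) ? EDGE : ((m <= lower_thr) ? NO_EDGE : MAYBE);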
+
+/** Traces edges recursively; called by edge_trace_U8_U8 when a MAYBE pixel has been promoted to EDGE
+ *
+ * @param[in] in Pointer to source image. Data type supported U8
+ * @param[out] out Pointer to destination image. Data type supported U8
+ * @param[in] in_stride Stride of the input image
+ * @param[in] out_stride Stride of the output image
+ */
+void edge_trace_recursive_U8_U8(uint8_t *__restrict in, uint8_t *__restrict out, const int32_t in_stride, const int32_t out_stride)
+{
+ // Look for MAYBE pixels in 8 directions
+ *out = EDGE;
+
+ // (-1, 0)
+ uint8_t pixel = *(in - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
+ }
+
+ // (+1, 0)
+ pixel = *(in + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
+ }
+
+ in -= in_stride;
+ out -= out_stride;
+
+ // (-1, -1)
+ pixel = *(in - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
+ }
+
+ // (0, -1)
+ pixel = *in;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *in = EDGE;
+
+ edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
+ }
+
+ // (+1, -1)
+ pixel = *(in + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
+ }
+
+ in += in_stride * 2;
+ out += out_stride * 2;
+
+ // (-1, +1)
+ pixel = *(in - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
+ }
+
+ // (0, +1)
+ pixel = *in;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *in = EDGE;
+
+ edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
+ }
+
+ // (+1, +1)
+ pixel = *(in + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(in + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
+ }
+}
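+
+// Note: the recursion above can grow as deep as the longest chain of
+// connected MAYBE pixels. An equivalent iterative sketch (illustration only)
+// using an explicit worklist (std::stack):
+//
+//   std::stack<std::pair<uint8_t *, uint8_t *>> work;
+//   work.push(std::make_pair(in, out));
+//   while(!work.empty())
+//   {
+//       uint8_t *p_in  = work.top().first;
+//       uint8_t *p_out = work.top().second;
+//       work.pop();
+//       *p_out = EDGE;
+//       for(int dy = -1; dy <= 1; ++dy)
+//       {
+//           for(int dx = -1; dx <= 1; ++dx)
+//           {
+//               uint8_t *n = p_in + dy * in_stride + dx;
+//               if(*n == MAYBE)
+//               {
+//                   *n = EDGE; // promote before pushing, as above
+//                   work.push(std::make_pair(n, p_out + dy * out_stride + dx));
+//               }
+//           }
+//       }
+//   }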
+} // namespace fp16
+
+void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+ set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+ set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+ Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+ set_format_if_unknown(*magnitude->info(), magnitude_format);
+ set_format_if_unknown(*phase->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+ ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ if(_gx->info()->data_type() == DataType::S16)
+ {
+ if(norm_type == 1)
+ {
+ _func = &fp16::mag_phase_l1norm_S16_S16_U16_U8;
+ }
+ else
+ {
+ _func = &fp16::mag_phase_l2norm_S16_S16_U16_U8;
+ }
+ }
+ else
+ {
+ if(norm_type == 1)
+ {
+ _func = &fp16::mag_phase_l1norm_S32_S32_U32_U8;
+ }
+ else
+ {
+ _func = &fp16::mag_phase_l2norm_S32_S32_U32_U8;
+ }
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 32;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
+
+ mag_access.set_valid_region(win, _gx->info()->valid_region());
+ phase_access.set_valid_region(win, _gx->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+namespace
+{
+inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
+    // Constants used for evaluating score1 and score3 (const45 = sqrt(2) / 2) and for the quantized phase values
+ static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
+ static const float32x4_t zero = vdupq_n_f32(0.0f);
+ static const float32x4_t one = vdupq_n_f32(1.0f);
+ static const float32x4_t two = vdupq_n_f32(2.0f);
+ static const float32x4_t three = vdupq_n_f32(3.0f);
+
+ // Score0: (1, 0)
+ const float32x4x2_t score0 =
+ {
+ {
+ vabsq_f32(gx.val[0]),
+ vabsq_f32(gx.val[1])
+ }
+ };
+
+ // Score2: ( 0, 1 )
+ const float32x4x2_t score2 =
+ {
+ {
+ vabsq_f32(gy.val[0]),
+ vabsq_f32(gy.val[1])
+ }
+ };
+
+    // Score1 and Score3: projections onto ( sqrt(2) / 2, sqrt(2) / 2 ) and ( -sqrt(2) / 2, sqrt(2) / 2 ) respectively
+ float32x4x2_t score1 =
+ {
+ {
+ vmulq_f32(gy.val[0], const45),
+ vmulq_f32(gy.val[1], const45)
+ }
+ };
+
+ float32x4x2_t score3 = score1;
+
+ score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
+ score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
+ score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
+ score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
+
+ score1.val[0] = vabsq_f32(score1.val[0]);
+ score1.val[1] = vabsq_f32(score1.val[1]);
+ score3.val[0] = vabsq_f32(score3.val[0]);
+ score3.val[1] = vabsq_f32(score3.val[1]);
+
+ float32x4x2_t phase =
+ {
+ {
+ zero,
+ zero
+ }
+ };
+
+ float32x4x2_t old_score = score0;
+
+ // score1 > old_score?
+ uint32x4x2_t mask =
+ {
+ {
+ vcgtq_f32(score1.val[0], old_score.val[0]),
+ vcgtq_f32(score1.val[1], old_score.val[1])
+ }
+ };
+
+ phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
+
+ // score2 > old_score?
+ mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
+
+ // score3 > old_score?
+ mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
+ mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
+
+ phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]);
+ phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]);
+ old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
+ old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
+
+ // Convert from float32x4_t to uint8x8_t
+ return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
+ vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
+}
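+
+/* For reference, a scalar sketch of the quantization above (illustrative only;
+ * index_of_max() is a hypothetical helper, not part of the library):
+ *
+ *   float s0 = fabsf(gx);                  // projection onto   0 degrees
+ *   float s1 = fabsf(0.7071f * (gx + gy)); // projection onto  45 degrees
+ *   float s2 = fabsf(gy);                  // projection onto  90 degrees
+ *   float s3 = fabsf(0.7071f * (gy - gx)); // projection onto 135 degrees
+ *   uint8_t phase = index_of_max(s0, s1, s2, s3); // 0, 1, 2 or 3
+ *
+ * Ties keep the lower bin, matching the strict greater-than comparisons
+ * (vcgtq_f32) used above.
+ */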
+
+/* Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ // Convert to float
+ const float32x4x2_t gx_f32 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
+ }
+ };
+
+ const float32x4x2_t gy_f32 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
+ }
+ };
+
+ return phase_quantization(gx_f32, gy_f32);
+}
+
+/* Computes the gradient phase if gradient_size = 7. The output is quantized.
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return quantized phase for 8 pixels
+ */
+inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Convert to float
+ const float32x4x2_t gx_f32 =
+ {
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ }
+ };
+
+ const float32x4x2_t gy_f32 =
+ {
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ }
+ };
+
+ return phase_quantization(gx_f32, gy_f32);
+}
+
+/* Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
+ vreinterpretq_u16_s16(vabsq_s16(gy)));
+}
+
+/* Computes the magnitude using the L1-norm type if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ const uint32x4x2_t gx_abs =
+ {
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
+ }
+ };
+
+ const uint32x4x2_t gy_abs =
+ {
+ {
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
+ vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
+ }
+ };
+
+ const uint32x4x2_t output =
+ {
+ {
+ vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
+ vaddq_u32(gx_abs.val[1], gy_abs.val[1])
+ }
+ };
+
+ return output;
+}
+
+inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
+{
+ // x^2 ...
+ float32x4x2_t magnitude =
+ {
+ {
+ vmulq_f32(gx.val[0], gx.val[0]),
+ vmulq_f32(gx.val[1], gx.val[1])
+ }
+ };
+
+ // ... + y^2
+ magnitude.val[0] = vmlaq_f32(magnitude.val[0], gy.val[0], gy.val[0]);
+ magnitude.val[1] = vmlaq_f32(magnitude.val[1], gy.val[1], gy.val[1]);
+
+    // sqrt(x) = x * (1 / sqrt(x)); vrsqrteq_f32 is an estimate only (no Newton-Raphson refinement), so the magnitude is approximate
+ magnitude.val[0] = vmulq_f32(vrsqrteq_f32(magnitude.val[0]), magnitude.val[0]);
+ magnitude.val[1] = vmulq_f32(vrsqrteq_f32(magnitude.val[1]), magnitude.val[1]);
+
+ return magnitude;
+}
+
+/* Computes the magnitude using L2-norm if gradient_size = 3 or 5
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
+{
+ // Compute magnitude using L2 normalization
+ const float32x4x2_t gx2 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
+ }
+ };
+
+ const float32x4x2_t gy2 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
+ }
+ };
+
+ const float32x4x2_t magnitude = mag_l2(gx2, gy2);
+
+ // Store magnitude - Convert to uint16x8
+ return vcombine_u16(vmovn_u32(vcvtq_u32_f32(magnitude.val[0])),
+ vmovn_u32(vcvtq_u32_f32(magnitude.val[1])));
+}
+
+/* Computes the magnitude using L2-norm if gradient_size = 7
+ *
+ * @param[in] gx Gx component
+ * @param[in] gy Gy component
+ *
+ * @return magnitude for 8 pixels
+ */
+inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
+{
+ // Compute magnitude using L2 normalization
+ float32x4x2_t gx2 =
+ {
+ {
+ vcvtq_f32_s32(gx.val[0]),
+ vcvtq_f32_s32(gx.val[1])
+ }
+ };
+
+ float32x4x2_t gy2 =
+ {
+ {
+ vcvtq_f32_s32(gy.val[0]),
+ vcvtq_f32_s32(gy.val[1])
+ }
+ };
+
+ const float32x4x2_t magnitude = mag_l2(gx2, gy2);
+ const uint32x4x2_t mag32 =
+ {
+ {
+ vcvtq_u32_f32(magnitude.val[0]),
+ vcvtq_u32_f32(magnitude.val[1])
+ }
+ };
+
+ return mag32;
+}
+
+/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ const auto gx = static_cast<const int16_t *__restrict>(gx_ptr);
+ const auto gy = static_cast<const int16_t *__restrict>(gy_ptr);
+ const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+ const int16x8x4_t gx_val =
+ {
+ {
+ vld1q_s16(gx),
+ vld1q_s16(gx + 8),
+ vld1q_s16(gx + 16),
+ vld1q_s16(gx + 24)
+ }
+ };
+
+ const int16x8x4_t gy_val =
+ {
+ {
+ vld1q_s16(gy),
+ vld1q_s16(gy + 8),
+ vld1q_s16(gy + 16),
+ vld1q_s16(gy + 24)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
+
+    // Compute and store magnitude using L1 normalization
+ vst1q_u16(magnitude + 0, mag_l1_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1q_u16(magnitude + 8, mag_l1_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1q_u16(magnitude + 16, mag_l1_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1q_u16(magnitude + 24, mag_l1_S16_S16(gx_val.val[3], gy_val.val[3]));
+}
+
+/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ const auto gx = static_cast<const int16_t *__restrict>(gx_ptr);
+ const auto gy = static_cast<const int16_t *__restrict>(gy_ptr);
+ const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+ const int16x8x4_t gx_val =
+ {
+ {
+ vld1q_s16(gx),
+ vld1q_s16(gx + 8),
+ vld1q_s16(gx + 16),
+ vld1q_s16(gx + 24)
+ }
+ };
+
+ const int16x8x4_t gy_val =
+ {
+ {
+ vld1q_s16(gy),
+ vld1q_s16(gy + 8),
+ vld1q_s16(gy + 16),
+ vld1q_s16(gy + 24)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
+
+ // Compute and store magnitude using L2 normalization
+ vst1q_u16(magnitude + 0, mag_l2_S16_S16(gx_val.val[0], gy_val.val[0]));
+ vst1q_u16(magnitude + 8, mag_l2_S16_S16(gx_val.val[1], gy_val.val[1]));
+ vst1q_u16(magnitude + 16, mag_l2_S16_S16(gx_val.val[2], gy_val.val[2]));
+ vst1q_u16(magnitude + 24, mag_l2_S16_S16(gx_val.val[3], gy_val.val[3]));
+}
+
+/* Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ auto gx = static_cast<const int32_t *__restrict>(gx_ptr);
+ auto gy = static_cast<const int32_t *__restrict>(gy_ptr);
+ auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
+ auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+    // Process low and high parts (2 x 16 pixels = 32 pixels per call)
+ for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ {
+ vld1q_s32(gx + 0),
+ vld1q_s32(gx + 4)
+ }
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ {
+ vld1q_s32(gx + 8),
+ vld1q_s32(gx + 12)
+ }
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ {
+ vld1q_s32(gy + 0),
+ vld1q_s32(gy + 4)
+ }
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ {
+ vld1q_s32(gy + 8),
+ vld1q_s32(gy + 12)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L1 normalization
+ const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(magnitude + 0, mag0.val[0]);
+ vst1q_u32(magnitude + 4, mag0.val[1]);
+ vst1q_u32(magnitude + 8, mag1.val[0]);
+ vst1q_u32(magnitude + 12, mag1.val[1]);
+ }
+}
+
+/* Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+ auto gx = static_cast<const int32_t *__restrict>(gx_ptr);
+ auto gy = static_cast<const int32_t *__restrict>(gy_ptr);
+ auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
+ auto phase = static_cast<uint8_t *__restrict>(phase_ptr);
+
+    // Process low and high parts (2 x 16 pixels = 32 pixels per call)
+ for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
+ {
+ const int32x4x2_t gx0 =
+ {
+ {
+ vld1q_s32(gx + 0),
+ vld1q_s32(gx + 4)
+ }
+ };
+
+ const int32x4x2_t gx1 =
+ {
+ {
+ vld1q_s32(gx + 8),
+ vld1q_s32(gx + 12)
+ }
+ };
+
+ const int32x4x2_t gy0 =
+ {
+ {
+ vld1q_s32(gy + 0),
+ vld1q_s32(gy + 4)
+ }
+ };
+
+ const int32x4x2_t gy1 =
+ {
+ {
+ vld1q_s32(gy + 8),
+ vld1q_s32(gy + 12)
+ }
+ };
+
+ // Compute and store phase
+ vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
+ vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
+
+ // Compute magnitude using L2 normalization
+ const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
+ const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
+
+ // Store magnitude
+ vst1q_u32(magnitude + 0, mag0.val[0]);
+ vst1q_u32(magnitude + 4, mag0.val[1]);
+ vst1q_u32(magnitude + 8, mag1.val[0]);
+ vst1q_u32(magnitude + 12, mag1.val[1]);
+ }
+}
+
+/* Computes non-maxima suppression and hysteresis when the gradient size = 3 or 5
+ *
+ * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U16
+ * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8
+ * @param[out] output_ptr Pointer to output image. Data type supported U8
+ * @param[in] stride_mag Stride of magnitude image
+ * @param[in] lower_thr Lower threshold used for the hysteresis
+ * @param[in] upper_thr Upper threshold used for the hysteresis
+ */
+void non_max_suppression_U16_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
+ const int32_t upper_thr)
+{
+ const auto magnitude = static_cast<const uint16_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<const uint8_t *__restrict>(phase_ptr);
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+ // Get magnitude and phase of the centre pixels
+ uint16x8_t mc = vld1q_u16(magnitude);
+
+ // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));
+
+ // 0 degree
+ const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1);
+ const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1);
+ uint16x8_t mask0 = vceqq_u16(pc16, vdupq_n_u16(0));
+ mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_0));
+ mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_1));
+
+ // 45 degree
+ const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1);
+ const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1);
+ uint16x8_t mask1 = vceqq_u16(pc16, vdupq_n_u16(1));
+ mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_0));
+ mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_1));
+
+ // 90 degree
+ const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag);
+ const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag);
+ uint16x8_t mask2 = vceqq_u16(pc16, vdupq_n_u16(2));
+ mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_0));
+ mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_1));
+
+ // 135 degree
+ const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1);
+ const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1);
+ uint16x8_t mask3 = vceqq_u16(pc16, vdupq_n_u16(3));
+ mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_0));
+ mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_1));
+
+ // Merge masks
+ mask0 = vorrq_u16(mask0, mask1);
+ mask2 = vorrq_u16(mask2, mask3);
+ mask0 = vorrq_u16(mask0, mask2);
+
+ mc = vbslq_u16(mask0, mc, vdupq_n_u16(0));
+
+ // mc > upper_thr
+ mask0 = vcgtq_u16(mc, vdupq_n_u16(upper_thr));
+
+ // mc <= lower_thr
+ mask1 = vcleq_u16(mc, vdupq_n_u16(lower_thr));
+
+ // mc <= upper_thr && mc > lower_thr
+ mask2 = vcleq_u16(mc, vdupq_n_u16(upper_thr));
+ mask2 = vandq_u16(mask2, vcgtq_u16(mc, vdupq_n_u16(lower_thr)));
+
+ mc = vbslq_u16(mask0, vdupq_n_u16(EDGE), mc);
+ mc = vbslq_u16(mask1, vdupq_n_u16(NO_EDGE), mc);
+ mc = vbslq_u16(mask2, vdupq_n_u16(MAYBE), mc);
+
+ vst1_u8(output, vmovn_u16(mc));
+}
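+
+/* The classification above is equivalent to this scalar sketch (illustrative
+ * only), applied to each non-maxima-suppressed magnitude mc:
+ *
+ *   if(mc > upper_thr)       out = EDGE;    // strong edge
+ *   else if(mc <= lower_thr) out = NO_EDGE; // rejected
+ *   else                     out = MAYBE;   // resolved later by edge tracing
+ */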
+
+inline uint16x4_t non_max_U32_helper(const uint32_t *input, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
+{
+    // Phase for 4 pixels
+ const uint32x4_t pc32 = vmovl_u16(pc);
+
+    // Get magnitude for 4 pixels
+ uint32x4_t mc = vld1q_u32(input);
+
+ // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
+ // 0 degree
+ const uint32x4_t mk0_0 = vld1q_u32(input - 1);
+ const uint32x4_t mk0_1 = vld1q_u32(input + 1);
+ uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0));
+ mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1));
+
+ // 45 degree
+ const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1);
+ const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1);
+ uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0));
+ mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1));
+
+ // 90 degree
+ const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag);
+ const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag);
+ uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0));
+ mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1));
+
+ // 135 degree
+ const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1);
+ const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1);
+ uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0));
+ mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1));
+
+ // Merge masks
+ mask0 = vorrq_u32(mask0, mask1);
+ mask2 = vorrq_u32(mask2, mask3);
+ mask0 = vorrq_u32(mask0, mask2);
+
+ mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
+
+ // mc > upper_thr
+ mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
+
+ // mc <= lower_thr
+ mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
+
+ // mc <= upper_thr && mc > lower_thr
+ mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
+ mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
+
+ mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
+ mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
+ mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
+
+ return vmovn_u32(mc);
+}
+
+/* Computes non-maxima suppression and hysteresis when the gradient_size = 7
+ *
+ * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U32
+ * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8
+ * @param[out] output_ptr Pointer to destination image. Data type supported U8
+ * @param[in] stride_mag Stride of magnitude image
+ * @param[in] lower_thr Lower threshold used for the hysteresis
+ * @param[in] upper_thr Upper threshold used for the hysteresis
+ */
+void non_max_suppression_U32_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
+ const int32_t upper_thr)
+{
+ const auto magnitude = static_cast<const uint32_t *__restrict>(magnitude_ptr);
+ const auto phase = static_cast<const uint8_t *__restrict>(phase_ptr);
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+    // Get phase for 8 pixels
+ const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));
+
+    // Compute non-maxima suppression on two groups of 4 pixels
+ const uint16x4x2_t res =
+ {
+ {
+ non_max_U32_helper(magnitude, vget_low_u16(pc16), stride_mag, lower_thr, upper_thr),
+ non_max_U32_helper(magnitude + 4, vget_high_u16(pc16), stride_mag, lower_thr, upper_thr)
+ }
+ };
+
+ // Store result
+ vst1_u8(output, vmovn_u16(vcombine_u16(res.val[0], res.val[1])));
+}
+
+/* Traces edges recursively; called by edge_trace_U8_U8 and by itself on each MAYBE pixel it promotes to EDGE
+ *
+ * @param[in] input Pointer to source image. Data type supported U8
+ * @param[out] output Pointer to destination image. Data type supported U8
+ * @param[in] input_stride Stride of the input image
+ * @param[in] output_stride Stride of the output image
+ */
+void edge_trace_recursive_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
+{
+ // Look for MAYBE pixels in 8 directions
+ *output = EDGE;
+
+ // (-1, 0)
+ uint8_t pixel = *(input - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
+ }
+
+ // (+1, 0)
+ pixel = *(input + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
+ }
+
+ input -= input_stride;
+ output -= output_stride;
+
+ // (-1, -1)
+ pixel = *(input - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
+ }
+
+ // (0, -1)
+ pixel = *input;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *input = EDGE;
+
+ edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
+ }
+
+ // (+1, -1)
+ pixel = *(input + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
+ }
+
+ input += input_stride * 2;
+ output += output_stride * 2;
+
+ // (-1, +1)
+ pixel = *(input - 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input - 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
+ }
+
+ // (0, +1)
+ pixel = *input;
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *input = EDGE;
+
+ edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
+ }
+
+ // (+1, +1)
+ pixel = *(input + 1);
+
+ if(pixel == MAYBE)
+ {
+ // Touched a MAYBE point. MAYBE becomes EDGE
+ *(input + 1) = EDGE;
+
+ edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
+ }
+}
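+
+/* Note: this is a depth-first flood fill over the 8-connected neighbourhood;
+ * the call stack grows with the length of the connected chain of MAYBE pixels
+ * being promoted to EDGE.
+ */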
+
+/* Performs edge tracing: propagates EDGE status from confirmed edge pixels to connected MAYBE pixels
+ *
+ * @param[in] input Pointer to source image. Data type supported U8
+ * @param[out] output Pointer to destination image. Data type supported U8
+ * @param[in] input_stride Stride of the input image
+ * @param[in] output_stride Stride of the output image
+ */
+void edge_trace_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
+{
+ if(*input == NO_EDGE)
+ {
+ *output = NO_EDGE;
+ }
+ // Check if EDGE and not yet touched
+ else if((*input == EDGE) && (*output == NO_EDGE))
+ {
+ edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
+ }
+}
+} // namespace
+
+NEGradientKernel::NEGradientKernel()
+ : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
+
+ set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
+ set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
+
+ Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
+ set_format_if_unknown(*magnitude->info(), magnitude_format);
+ set_format_if_unknown(*phase->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+ ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ if(_gx->info()->data_type() == DataType::S16)
+ {
+ if(norm_type == 1)
+ {
+ _func = &mag_phase_l1norm_S16_S16_U16_U8;
+ }
+ else
+ {
+ _func = &mag_phase_l2norm_S16_S16_U16_U8;
+ }
+ }
+ else
+ {
+ if(norm_type == 1)
+ {
+ _func = &mag_phase_l1norm_S32_S32_U32_U8;
+ }
+ else
+ {
+ _func = &mag_phase_l2norm_S32_S32_U32_U8;
+ }
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 32;
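+    // 32 pixels per call matches the vector loads in the mag/phase functions
+    // above: 4 x int16x8 for S16, or 2 x 16 int32 values for S32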
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
+
+ mag_access.set_valid_region(win, _gx->info()->valid_region());
+ phase_access.set_valid_region(win, _gx->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEGradientKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(gx.ptr(), gy.ptr(), magnitude.ptr(), phase.ptr());
+ },
+ gx, gy, magnitude, phase);
+}
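+
+// For context: these kernels form the middle stages of the library's Canny
+// pipeline (Sobel -> NEGradientKernel -> NEEdgeNonMaxSuppressionKernel ->
+// NEEdgeTraceKernel); the magnitude and quantized phase written here feed the
+// suppression stage below.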
+
+NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel()
+ : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0)
+{
+}
+
+BorderSize NEEdgeNonMaxSuppressionKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output,
+ int32_t upper_thr, int32_t lower_thr, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output);
+
+ set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape());
+
+ set_format_if_unknown(*phase->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output);
+
+ _magnitude = magnitude;
+ _phase = phase;
+ _output = output;
+
+ switch(_magnitude->info()->data_type())
+ {
+ case DataType::U16:
+ _func = &non_max_suppression_U16_U8_U8;
+ break;
+ case DataType::U32:
+ _func = &non_max_suppression_U32_U8_U8;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type!");
+ }
+
+ // Set thresholds
+ _lower_thr = lower_thr;
+ _upper_thr = upper_thr;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 10;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
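+    // Producing 8 output pixels requires a 10x3 input patch: 8 elements plus
+    // one column of border on each side, over the rows above, at and below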
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, mag_access, phase_access, output_access);
+
+ output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEEdgeNonMaxSuppressionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+ Iterator output(_output, window);
+
+ const size_t input1_stride = _magnitude->info()->strides_in_bytes()[1];
+ const size_t input1_stride_ushort = input1_stride / data_size_from_type(_magnitude->info()->data_type());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(magnitude.ptr(), phase.ptr(), output.ptr(), input1_stride_ushort, _lower_thr, _upper_thr);
+ },
+ magnitude, phase, output);
+}
+
+NEEdgeTraceKernel::NEEdgeTraceKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize NEEdgeTraceKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+bool NEEdgeTraceKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ set_format_if_unknown(*input->info(), Format::U8);
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
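+    // Edge tracing follows arbitrary connected paths through the image, so the
+    // kernel advances one pixel at a time and is_parallelisable() returns false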
+
+ // Configure kernel window
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+ const ValidRegion &output_valid_region = output->info()->valid_region();
+
+ // Reads can occur within the valid region of the input + border
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0] - border_size().left,
+ input_valid_region.anchor[1] - border_size().top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
+
+ // Writes can occur within the valid region of the output + border
+ AccessWindowStatic output_access(output->info(),
+ output_valid_region.anchor[0] - border_size().left,
+ output_valid_region.anchor[1] - border_size().top,
+ output_valid_region.anchor[0] + output_valid_region.shape[0] + border_size().right,
+ output_valid_region.anchor[1] + output_valid_region.shape[1] + border_size().bottom);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, _input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEEdgeTraceKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input->info()->strides_in_bytes()[1];
+ const size_t output_stride = _output->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ edge_trace_U8_U8(input.ptr(), output.ptr(), input_stride, output_stride);
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
new file mode 100644
index 0000000000..3147a698ad
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEChannelCombineKernel::NEChannelCombineKernel()
+    : _func(nullptr), _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }, _num_elems_processed_per_iteration(8),
+      _is_parallelizable(true)
+{
+}
+
+void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
+ ARM_COMPUTE_ERROR_ON(plane0 == output);
+ ARM_COMPUTE_ERROR_ON(plane1 == output);
+ ARM_COMPUTE_ERROR_ON(plane2 == output);
+
+ set_format_if_unknown(*plane0->info(), Format::U8);
+ set_format_if_unknown(*plane1->info(), Format::U8);
+ set_format_if_unknown(*plane2->info(), Format::U8);
+
+ if(plane3 != nullptr)
+ {
+ set_format_if_unknown(*plane3->info(), Format::U8);
+ }
+
+ set_shape_if_empty(*output->info(), plane0->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
+
+ if(plane3 != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, plane3);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane3);
+ }
+
+ const Format &output_format = output->info()->format();
+
+ if(output_format == Format::RGBA8888)
+ {
+ ARM_COMPUTE_ERROR_ON(plane3 == output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
+ }
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = plane3;
+ _output = output;
+ _output_multi = nullptr;
+
+ _num_elems_processed_per_iteration = 8;
+ _is_parallelizable = true;
+
+ switch(output_format)
+ {
+ case Format::RGB888:
+ _func = &NEChannelCombineKernel::combine_3C;
+ break;
+ case Format::RGBA8888:
+ _func = &NEChannelCombineKernel::combine_4C;
+ break;
+ case Format::UYVY422:
+ _x_subsampling[1] = 2;
+ _x_subsampling[2] = 2;
+ _num_elems_processed_per_iteration = 16;
+ _func = &NEChannelCombineKernel::combine_YUV_1p<true>;
+ break;
+ case Format::YUYV422:
+ _x_subsampling[1] = 2;
+ _x_subsampling[2] = 2;
+ _num_elems_processed_per_iteration = 16;
+ _func = &NEChannelCombineKernel::combine_YUV_1p<false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
+ TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
+ TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
+
+ Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+ AccessWindowHorizontal plane0_access(plane0->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[0]);
+ AccessWindowHorizontal plane1_access(plane1->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[1]);
+ AccessWindowHorizontal plane2_access(plane2->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[2]);
+ AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, _num_elems_processed_per_iteration);
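+    // plane3 is optional: an access window built with a nullptr tensor info is
+    // ignored by update_window_and_padding() below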
+
+ update_window_and_padding(
+ win,
+ plane0_access,
+ plane1_access,
+ plane2_access,
+ plane3_access,
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
+ plane1->info()->valid_region(),
+ plane2->info()->valid_region());
+
+ if(plane3 != nullptr)
+ {
+ valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
+ }
+
+ output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+
+ set_format_if_unknown(*plane0->info(), Format::U8);
+ set_format_if_unknown(*plane1->info(), Format::U8);
+ set_format_if_unknown(*plane2->info(), Format::U8);
+
+ set_shape_if_empty(*output->plane(0)->info(), plane0->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = plane0->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(1)->info()->tensor_shape(), subsampled_shape);
+
+ if(output->info()->format() == Format::IYUV)
+ {
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(2)->info()->tensor_shape(), subsampled_shape);
+ }
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), plane0->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), plane0->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane1, plane2, output->plane(1), output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, output->plane(0));
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
+
+ _planes[0] = plane0;
+ _planes[1] = plane1;
+ _planes[2] = plane2;
+ _planes[3] = nullptr;
+ _output = nullptr;
+ _output_multi = output;
+ bool has_two_planes = false;
+ unsigned int num_elems_written_plane1 = 8;
+
+ _num_elems_processed_per_iteration = 8;
+ _is_parallelizable = true;
+
+ const Format &output_format = output->info()->format();
+
+ switch(output_format)
+ {
+ case Format::NV12:
+ case Format::NV21:
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ _func = &NEChannelCombineKernel::combine_YUV_2p;
+ has_two_planes = true;
+ num_elems_written_plane1 = 16;
+ break;
+ case Format::IYUV:
+ _is_parallelizable = false;
+ _x_subsampling = { { 1, 2, 2 } };
+ _y_subsampling = { { 1, 2, 2 } };
+ _func = &NEChannelCombineKernel::combine_YUV_3p;
+ break;
+ case Format::YUV444:
+ _is_parallelizable = false;
+ _x_subsampling = { { 1, 1, 1 } };
+ _y_subsampling = { { 1, 1, 1 } };
+ _func = &NEChannelCombineKernel::combine_YUV_3p;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
+ const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end());
+
+ Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step));
+ AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration),
+ AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
+ AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
+ output_plane0_access,
+ output_plane1_access,
+ output_plane2_access);
+
+ ValidRegion plane0_valid_region = plane0->info()->valid_region();
+
+ ValidRegion output_plane1_region = has_two_planes ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
+
+ output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
+ output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
+ output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+bool NEChannelCombineKernel::is_parallelisable() const
+{
+ return _is_parallelizable;
+}
+
+void NEChannelCombineKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+void NEChannelCombineKernel::combine_3C(const Window &win)
+{
+ Iterator p0(_planes[0], win);
+ Iterator p1(_planes[1], win);
+ Iterator p2(_planes[2], win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
+ const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
+ const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ const uint8x8x3_t pixels =
+ {
+ {
+ vld1_u8(p0_ptr),
+ vld1_u8(p1_ptr),
+ vld1_u8(p2_ptr)
+ }
+ };
+
+ vst3_u8(out_ptr, pixels);
+ },
+ p0, p1, p2, out);
+}
+
+void NEChannelCombineKernel::combine_4C(const Window &win)
+{
+ Iterator p0(_planes[0], win);
+ Iterator p1(_planes[1], win);
+ Iterator p2(_planes[2], win);
+ Iterator p3(_planes[3], win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
+ const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
+ const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
+ const auto p3_ptr = static_cast<uint8_t *>(p3.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ const uint8x8x4_t pixels =
+ {
+ {
+ vld1_u8(p0_ptr),
+ vld1_u8(p1_ptr),
+ vld1_u8(p2_ptr),
+ vld1_u8(p3_ptr)
+ }
+ };
+
+ vst4_u8(out_ptr, pixels);
+ },
+ p0, p1, p2, p3, out);
+}
+
+template <bool is_uyvy>
+void NEChannelCombineKernel::combine_YUV_1p(const Window &win)
+{
+    // Create the sub-sampled UV window used to initialise the U and V plane iterators
+ Window win_uv(win);
+ win_uv.set_dimension_step(0, win.x().step() / _x_subsampling[1]);
+ win_uv.validate();
+
+ Iterator p0(_planes[0], win);
+ Iterator p1(_planes[1], win_uv);
+ Iterator p2(_planes[2], win_uv);
+ Iterator out(_output, win);
+
+ constexpr auto shift = is_uyvy ? 1 : 0;
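+    // shift == 0 interleaves as YUYV (Y0 U Y1 V); shift == 1 as UYVY (U Y0 V Y1)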
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
+ const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
+ const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ const uint8x8x2_t pixels_y = vld2_u8(p0_ptr);
+ const uint8x8x2_t pixels_uv =
+ {
+ {
+ vld1_u8(p1_ptr),
+ vld1_u8(p2_ptr)
+ }
+ };
+
+ uint8x8x4_t pixels{ {} };
+ pixels.val[0 + shift] = pixels_y.val[0];
+ pixels.val[1 - shift] = pixels_uv.val[0];
+ pixels.val[2 + shift] = pixels_y.val[1];
+ pixels.val[3 - shift] = pixels_uv.val[1];
+
+ vst4_u8(out_ptr, pixels);
+ },
+ p0, p1, p2, out);
+}
+
+void NEChannelCombineKernel::combine_YUV_2p(const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[1]);
+ ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[1]);
+
+ // Copy first plane
+ copy_plane(win, 0);
+
+ // Update UV window
+ Window uv_win(win);
+ uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], _num_elems_processed_per_iteration));
+ uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
+ uv_win.validate();
+
+ // Update output win
+ Window out_win(win);
+ out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() * 2));
+ out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
+ out_win.validate();
+
+ // Construct second plane
+ const int shift = (Format::NV12 == _output_multi->info()->format()) ? 0 : 1;
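+    // NV12 stores interleaved UV while NV21 stores VU; 'shift' swaps which input
+    // plane feeds each iterator so the same loop handles both formats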
+ Iterator p1(_planes[1 + shift], uv_win);
+ Iterator p2(_planes[2 - shift], uv_win);
+ Iterator out(_output_multi->plane(1), out_win);
+
+ execute_window_loop(out_win, [&](const Coordinates & id)
+ {
+ const uint8x8x2_t pixels =
+ {
+ {
+ vld1_u8(p1.ptr()),
+ vld1_u8(p2.ptr())
+ }
+ };
+
+ vst2_u8(out.ptr(), pixels);
+ },
+ p1, p2, out);
+}
+
+void NEChannelCombineKernel::combine_YUV_3p(const Window &win)
+{
+ copy_plane(win, 0);
+ copy_plane(win, 1);
+ copy_plane(win, 2);
+}
+
+void NEChannelCombineKernel::copy_plane(const Window &win, uint32_t plane_id)
+{
+ ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[plane_id]);
+ ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[plane_id]);
+
+ // Update window
+ Window tmp_win(win);
+ tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], _num_elems_processed_per_iteration));
+ tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
+ tmp_win.validate();
+
+ Iterator in(_planes[plane_id], tmp_win);
+ Iterator out(_output_multi->plane(plane_id), tmp_win);
+
+ execute_window_loop(tmp_win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+
+ vst1_u8(out_ptr, vld1_u8(in_ptr));
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
new file mode 100644
index 0000000000..ebc4b85c98
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEChannelExtractKernel::NEChannelExtractKernel()
+ : _func(nullptr), _lut_index(0)
+{
+}
+
+void NEChannelExtractKernel::configure(const ITensor *input, Channel channel, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON(input == output);
+
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+
+ unsigned int num_elems_processed_per_iteration = 8;
+
+ // Check format and channel
+ const Format format = input->info()->format();
+ const unsigned int subsampling = (format == Format::YUYV422 || format == Format::UYVY422) && channel != Channel::Y ? 2 : 1;
+ TensorShape output_shape;
+
+ switch(format)
+ {
+ case Format::RGB888:
+ case Format::RGBA8888:
+ num_elems_processed_per_iteration = 16;
+ output_shape = input->info()->tensor_shape();
+
+ if(format == Format::RGB888)
+ {
+ _func = &NEChannelExtractKernel::extract_1C_from_3C_img;
+ }
+ else if(format == Format::RGBA8888)
+ {
+ _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
+ }
+
+ switch(channel)
+ {
+ case Channel::R:
+ _lut_index = 0;
+ break;
+ case Channel::G:
+ _lut_index = 1;
+ break;
+ case Channel::B:
+ _lut_index = 2;
+ break;
+ case Channel::A:
+ if(format == Format::RGBA8888)
+ {
+ _lut_index = 3;
+ _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
+ break;
+ }
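+                    // Intentional fall-through: Channel::A is only valid for RGBA8888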
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ case Format::YUYV422:
+ case Format::UYVY422:
+ output_shape = input->info()->tensor_shape();
+
+ if(channel != Channel::Y)
+ {
+ output_shape.set(0, output_shape[0] / 2);
+ }
+
+ switch(channel)
+ {
+ case Channel::Y:
+ num_elems_processed_per_iteration = 16;
+ _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
+ _lut_index = (Format::YUYV422 == format) ? 0 : 1;
+ break;
+ case Channel::U:
+ num_elems_processed_per_iteration = 32;
+ _func = &NEChannelExtractKernel::extract_YUYV_uv;
+ _lut_index = (Format::YUYV422 == format) ? 1 : 0;
+ break;
+ case Channel::V:
+ num_elems_processed_per_iteration = 32;
+ _func = &NEChannelExtractKernel::extract_YUYV_uv;
+ _lut_index = (Format::YUYV422 == format) ? 3 : 2;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
+ set_shape_if_empty(*output->info(), output_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion input_valid_region = input->info()->valid_region();
+
+ output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEChannelExtractKernel::configure(const IMultiImage *input, Channel channel, IImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
+ set_format_if_unknown(*output->info(), Format::U8);
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ switch(channel)
+ {
+ case Channel::Y:
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+ break;
+ case Channel::U:
+ case Channel::V:
+ set_shape_if_empty(*output->info(), input->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(1), output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported channel for selected format");
+ }
+ break;
+ case Format::YUV444:
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+
+ unsigned int num_elems_processed_per_iteration = 32;
+
+ const Format &format = input->info()->format();
+
+ switch(format)
+ {
+ case Format::NV12:
+ case Format::NV21:
+ switch(channel)
+ {
+ case Channel::Y:
+ _input = input->plane(0);
+ _func = &NEChannelExtractKernel::copy_plane;
+ break;
+ case Channel::U:
+ _input = input->plane(1);
+ num_elems_processed_per_iteration = 16;
+ _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
+ _lut_index = (Format::NV12 == format) ? 0 : 1;
+ break;
+ case Channel::V:
+ _input = input->plane(1);
+ num_elems_processed_per_iteration = 16;
+ _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
+ _lut_index = (Format::NV12 == format) ? 1 : 0;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ case Format::IYUV:
+ case Format::YUV444:
+ _func = &NEChannelExtractKernel::copy_plane;
+ switch(channel)
+ {
+ case Channel::Y:
+ _input = input->plane(0);
+ break;
+ case Channel::U:
+ _input = input->plane(1);
+ break;
+ case Channel::V:
+ _input = input->plane(2);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel for this format.");
+ break;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format.");
+ break;
+ }
+
+ _output = output;
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input_access(_input->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, _input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
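+// Typical use goes through the NEChannelExtract runtime function rather than the
+// kernel itself. A minimal sketch (tensor names are illustrative; both tensors are
+// assumed to be allocated with compatible formats beforehand):
+//
+//   NEChannelExtract extract;
+//   extract.configure(&yuyv_image, Channel::U, &u_plane); // u_plane: U8, half width
+//   extract.run();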
+void NEChannelExtractKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
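+// Extracts a single channel from two-channel interleaved data: vld2q_u8 splits 32
+// interleaved bytes into two 16-byte registers, one per channel, and _lut_index
+// selects which of the deinterleaved registers is stored.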
+void NEChannelExtractKernel::extract_1C_from_2C_img(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld2q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
+void NEChannelExtractKernel::extract_1C_from_3C_img(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld3q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
+void NEChannelExtractKernel::extract_1C_from_4C_img(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld4q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
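+// Extracts U or V from packed 4:2:2 data. Each 4-byte macropixel (Y0 U Y1 V for
+// YUYV, U Y0 V Y1 for UYVY) carries one U and one V sample for two pixels, so the
+// output window advances at half the input's X step; vld4q_u8 deinterleaves the
+// macropixel and _lut_index picks the wanted byte lane.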
+void NEChannelExtractKernel::extract_YUYV_uv(const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(win.x().step() % 2);
+
+ Window win_out(win);
+ win_out.set_dimension_step(Window::DimX, win.x().step() / 2);
+
+ Iterator in(_input, win);
+ Iterator out(_output, win_out);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ const auto pixels = vld4q_u8(in_ptr);
+ vst1q_u8(out_ptr, pixels.val[_lut_index]);
+ },
+ in, out);
+}
+
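+// Copies a plane verbatim: the 4-way deinterleaving load/store pair simply moves
+// 32 bytes per iteration, since storing with the same layout it was loaded with
+// amounts to a plain copy.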
+void NEChannelExtractKernel::copy_plane(const Window &win)
+{
+ Iterator in(_input, win);
+ Iterator out(_output, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+ vst4_u8(out_ptr, vld4_u8(in_ptr));
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
new file mode 100644
index 0000000000..6d370acff1
--- /dev/null
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
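+// col2im scatters each input element back to its spatial position: for the input
+// element at (x, y), y is the linear index of the convolved position and x the
+// output channel, so the value lands in the output at
+// (y % convolved_width, y / convolved_width, x).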
+template <typename T>
+void NECol2ImKernel::run_col2im(const Window &window)
+{
+ const int output_stride_x = _output->info()->strides_in_bytes().x();
+ const int output_stride_y = _output->info()->strides_in_bytes().y();
+ const int output_stride_z = _output->info()->strides_in_bytes().z();
+
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Create iterators
+ Iterator in(_input, window);
+ Iterator out(_output, window_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int hidx = id.y();
+ const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.first) * output_stride_y + (hidx % _convolved_dims.first) * output_stride_x;
+
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
+}
+
+NECol2ImKernel::NECol2ImKernel()
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_data_type_if_unknown(*output->info(), input->info()->data_type());
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, convolved_dims.first);
+ output_shape.set(1, convolved_dims.second);
+ output_shape.set(2, input->info()->tensor_shape()[0]);
+
+ set_shape_if_empty(*output->info(), output_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+ _convolved_dims = convolved_dims;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ _func = &NECol2ImKernel::run_col2im<uint8_t>;
+ break;
+ case 2:
+ _func = &NECol2ImKernel::run_col2im<uint16_t>;
+ break;
+ case 4:
+ _func = &NECol2ImKernel::run_col2im<uint32_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NECol2ImKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
new file mode 100644
index 0000000000..cb5152e2b3
--- /dev/null
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/NEON/NEColorConvertHelper.inl"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEColorConvertKernel::NEColorConvertKernel()
+ : _input(nullptr), _output(nullptr), _func(nullptr)
+{
+}
+
+void NEColorConvertKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
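+    // Select the conversion routine for the format pair and size the window step
+    // accordingly: 16 elements per iteration for the RGB <-> RGBA copies, 32 for
+    // the packed 4:2:2 conversions.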
+ switch(input->info()->format())
+ {
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_rgbx_to_rgb;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_yuyv_to_rgb<false, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_yuyv_to_rgb<false, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_yuyv_to_rgb<true, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_yuyv_to_rgb<true, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::RGB888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGBA8888:
+ _func = colorconvert_rgb_to_rgbx;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::configure(const IMultiImage *input, IImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
+ set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_nv12_to_rgb<true, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_nv12_to_rgb<true, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::NV21:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_nv12_to_rgb<false, false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_nv12_to_rgb<false, true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::RGB888:
+ _func = colorconvert_iyuv_to_rgb<false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::RGBA8888:
+ _func = colorconvert_iyuv_to_rgb<true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
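+    // Chroma is subsampled 2x vertically in NV12/NV21/IYUV, so the kernel walks
+    // the output two rows at a time: each chroma row serves two luma rows.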
+ win.set_dimension_step(Window::DimY, 2);
+
+ unsigned int input_plane_count = 3;
+
+ if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21)
+ {
+ input_plane_count = 2;
+ }
+
+ AccessWindowHorizontal input0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle input1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f);
+ AccessWindowRectangle input2_access(input_plane_count == 2 ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ input0_access, input1_access, input2_access,
+ output_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
+ input->plane(1)->info()->valid_region());
+
+ if(input_plane_count == 3)
+ {
+ intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region());
+ }
+
+ output_access.set_valid_region(win, intersect_region);
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::configure(const IImage *input, IMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+
+ set_shape_if_empty(*output->plane(0)->info(), input->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ {
+ TensorShape subsampled_shape = input->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = input->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), input->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(0));
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->format())
+ {
+ case Format::RGB888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_rgb_to_nv12<false>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_rgb_to_iyuv<false>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_rgb_to_yuv4<false>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::RGBA8888:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_rgb_to_nv12<true>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_rgb_to_iyuv<true>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_rgb_to_yuv4<true>;
+ num_elems_processed_per_iteration = 16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::UYVY422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_yuyv_to_nv12<false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_yuyv_to_iyuv<false>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::YUYV422:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_yuyv_to_nv12<true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ case Format::IYUV:
+ _func = colorconvert_yuyv_to_iyuv<true>;
+ num_elems_processed_per_iteration = 32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ float sub_sampling = 1.f;
+
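+    // All conversions here except RGB/RGBA to YUV444 produce vertically subsampled
+    // chroma, so those paths consume two input rows per iteration and scale the
+    // chroma plane accesses by 0.5 in each dimension.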
+ if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
+ {
+ win.set_dimension_step(Window::DimY, 2);
+ sub_sampling = 0.5f;
+ }
+
+ unsigned int output_plane_count = 3;
+
+ if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21)
+ {
+ output_plane_count = 2;
+ }
+
+ AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+ AccessWindowRectangle output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output0_access,
+ output1_access,
+ output2_access);
+
+ output0_access.set_valid_region(win, input->info()->valid_region());
+ output1_access.set_valid_region(win, input->info()->valid_region());
+ output2_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::configure(const IMultiImage *input, IMultiImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_ON(input == output);
+
+ set_shape_if_empty(*output->plane(0)->info(), input->plane(0)->info()->tensor_shape());
+
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ {
+ TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
+ subsampled_shape.set(0, subsampled_shape[0] / 2);
+ subsampled_shape.set(1, subsampled_shape[1] / 2);
+
+ set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
+ set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
+ break;
+ }
+ case Format::YUV444:
+ set_shape_if_empty(*output->plane(1)->info(), input->plane(0)->info()->tensor_shape());
+ set_shape_if_empty(*output->plane(2)->info(), input->plane(0)->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(2));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(0));
+
+ switch(input->info()->format())
+ {
+ case Format::NV12:
+ {
+ switch(output->info()->format())
+ {
+ case Format::IYUV:
+ _func = colorconvert_nv12_to_iyuv<true>;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_nv12_to_yuv4<true>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::NV21:
+ {
+ switch(output->info()->format())
+ {
+ case Format::IYUV:
+ _func = colorconvert_nv12_to_iyuv<false>;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_nv12_to_yuv4<false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ case Format::IYUV:
+ {
+ switch(output->info()->format())
+ {
+ case Format::NV12:
+ _func = colorconvert_iyuv_to_nv12;
+ break;
+ case Format::YUV444:
+ _func = colorconvert_iyuv_to_yuv4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 32;
+ constexpr float input_sub_sampling = 0.5f;
+ const float output_sub_sampling = output->info()->format() == Format::YUV444 ? 1.f : 0.5f;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->plane(0)->info(), Steps(num_elems_processed_per_iteration));
+ win.set_dimension_step(Window::DimY, 2);
+
+ unsigned int input_plane_count = 3;
+
+ if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21)
+ {
+ input_plane_count = 2;
+ }
+
+ unsigned int output_plane_count = 3;
+
+ if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21)
+ {
+ output_plane_count = 2;
+ }
+
+ AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowRectangle output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling);
+ AccessWindowRectangle output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->plane(0)->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowRectangle(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling),
+ AccessWindowRectangle(input_plane_count == 2 ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling),
+ output0_access,
+ output1_access,
+ output2_access);
+
+ ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
+ input->plane(1)->info()->valid_region());
+
+ if(input_plane_count == 3)
+ {
+ intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region());
+ }
+
+ output0_access.set_valid_region(win, intersect_region);
+ output1_access.set_valid_region(win, intersect_region);
+ output2_access.set_valid_region(win, intersect_region);
+
+ INEKernel::configure(win);
+}
+
+void NEColorConvertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
new file mode 100644
index 0000000000..30e91ef253
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -0,0 +1,1618 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace
+{
+const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
+
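+// The store_results overloads write two accumulator vectors to the output buffer,
+// saturating wherever a narrowing conversion is required: U8 results go through
+// the vqmov(u)n intrinsics, while unsigned accumulators bound for S16 are clamped
+// to INT16_MAX first.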
+inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
+{
+ const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
+ vqmovn_s32(out2));
+ vst1q_s16(output, s16results);
+}
+
+inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
+{
+ const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
+ vqmovun_s32(out2)));
+ vst1_u8(output, u8results);
+}
+
+inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
+{
+ const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
+ const int16x8_t s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
+ vst1q_s16(output, s16results);
+}
+
+inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
+{
+ const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
+ vqmovn_u32(out2)));
+ vst1_u8(output, u8results);
+}
+
+inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
+{
+ vst1q_s16(output, out);
+ vst1q_s16(output + 8, out2);
+}
+
+inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
+{
+ const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
+ vqmovun_s16(out2));
+ vst1q_u8(output, u8results);
+}
+
+inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
+{
+ const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
+ vqmovn_u16(out2));
+ vst1q_u8(output, u8results);
+}
+
+inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
+{
+ vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
+ vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
+}
+
+inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
+{
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x3_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row right value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+
+ // Calculate row left value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[1], mat0);
+ // Calculate row middle value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+ // Calculate row right value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+}
+
+inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+
+ convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
+}
+
+inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+ const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
+ const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
+
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x3_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left 2 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row left 1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+ // Calculate row right +1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
+ // Calculate row right +2 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[1], mat4);
+
+ // Calculate row left 2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[1], mat0);
+ // Calculate row left 1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+ // Calculate row middle value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+ // Calculate row right +1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
+ // Calculate row right +2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[2], mat4);
+}
+
+inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+ const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
+ const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
+ const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
+ const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
+
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x4_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1),
+ vget_high_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left 3 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row left 2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row left 1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
+ // Calculate row right +1 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[1], mat4);
+ // Calculate row right +2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
+ // Calculate row right +3 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
+
+ // Calculate row left 3 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[1], mat0);
+ // Calculate row left 2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+ // Calculate row left 1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+ // Calculate row middle value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
+ // Calculate row right +1 value for pixels [4,7]
+ out2 = vmlal_s16(out2, row.val[2], mat4);
+ // Calculate row right +2 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
+ // Calculate row right +3 value for pixels [4,7]
+ out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
+}
+
+inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
+{
+ const int16x4_t mat0 = vld1_dup_s16(convolution);
+ const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
+ const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
+ const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
+ const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
+ const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
+ const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
+ const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
+ const int16x4_t mat8 = vld1_dup_s16(convolution + 8);
+
+ // Convert to s16 and split in blocks of 4 values:
+ const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
+ const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));
+
+ const int16x4x4_t row =
+ {
+ {
+ vget_low_s16(s16_tmp0),
+ vget_high_s16(s16_tmp0),
+ vget_low_s16(s16_tmp1),
+ vget_high_s16(s16_tmp1)
+ }
+ };
+
+ // Calculate row left 4 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[0], mat0);
+ // Calculate row left 3 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
+ // Calculate row left 2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
+ // Calculate row left 1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
+ // Calculate row middle value for pixels [0,3]
+ out = vmlal_s16(out, row.val[1], mat4);
+ // Calculate row right +1 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
+ // Calculate row right +2 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
+ // Calculate row right +3 value for pixels [0,3]
+ out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
+ // Calculate row right +4 value for pixels [0,3]
+ out = vmlal_s16(out, row.val[2], mat8);
+
+    // Calculate row left 4 value for pixels [4,7]
+    out2 = vmlal_s16(out2, row.val[1], mat0);
+    // Calculate row left 3 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
+    // Calculate row left 2 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
+    // Calculate row left 1 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
+    // Calculate row middle value for pixels [4,7]
+    out2 = vmlal_s16(out2, row.val[2], mat4);
+    // Calculate row right +1 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
+    // Calculate row right +2 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
+    // Calculate row right +3 value for pixels [4,7]
+    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
+    // Calculate row right +4 value for pixels [4,7]
+    out2 = vmlal_s16(out2, row.val[3], mat8);
+}
+} // namespace
+
+/****************************************************************************************\
+ * Square Convolution *
+\****************************************************************************************/
+
+template <unsigned int matrix_size>
+NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
+ : INESimpleKernel(), _scale(0), _convolution{ {} }
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NEConvolutionKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2);
+}
+
+template <unsigned int matrix_size>
+void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+
+ _input = input;
+ _output = output;
+
+ std::copy_n(conv, _convolution.size(), _convolution.begin());
+
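+    // A scale of 0 selects automatic normalisation: calculate_matrix_scale derives
+    // the scale from the coefficients (following the OpenVX convention this is the
+    // sum of the matrix, with 1 as the fallback when the sum is zero).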
+ if(scale == 0)
+ {
+ _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
+ }
+ else
+ {
+ _scale = scale;
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
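+    // Each iteration loads 16 input bytes but writes only 8 results: the surplus
+    // gives the vext-based taps the headroom they need to the right of the eight
+    // output pixels, for every matrix size up to 9.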
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<3>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ // Load the matrix's coefficients into NEON registers:
+ const int16x4_t mat00 = vld1_dup_s16(_convolution.data());
+ const int16x4_t mat01 = vld1_dup_s16(_convolution.data() + 1);
+ const int16x4_t mat02 = vld1_dup_s16(_convolution.data() + 2);
+ const int16x4_t mat10 = vld1_dup_s16(_convolution.data() + 3);
+ const int16x4_t mat11 = vld1_dup_s16(_convolution.data() + 4);
+ const int16x4_t mat12 = vld1_dup_s16(_convolution.data() + 5);
+ const int16x4_t mat20 = vld1_dup_s16(_convolution.data() + 6);
+ const int16x4_t mat21 = vld1_dup_s16(_convolution.data() + 7);
+ const int16x4_t mat22 = vld1_dup_s16(_convolution.data() + 8);
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
+ const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top row:
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);
+
+        // Load 16 bytes from the low row:
+ const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
+ convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<5>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
+ const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
+ const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
+ const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top2 row:
+ const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
+ convolve_row5x1(out, out2, data_t2, _convolution.data());
+
+ // Load 16 bytes from the top1 row:
+ const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
+ convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row5x1(out, out2, data_m, _convolution.data() + 10);
+
+ // Load 16 bytes from the low1 row:
+ const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
+ convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);
+
+ // Load 16 bytes from the low2 row:
+ const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
+ convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<7>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
+ const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
+ const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
+ const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
+ const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
+ const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top3 row:
+ const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
+ convolve_row7x1(out, out2, data_t3, _convolution.data());
+
+ // Load 16 bytes from the top2 row:
+ const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
+ convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);
+
+ // Load 16 bytes from the top1 row:
+ const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
+ convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row7x1(out, out2, data_m, _convolution.data() + 21);
+
+ // Load 16 bytes from the low1 row:
+ const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
+ convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);
+
+ // Load 16 bytes from the low2 row:
+ const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
+ convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);
+
+ // Load 16 bytes from the low3 row:
+ const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
+ convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+
+template <>
+template <typename OutputType>
+void NEConvolutionKernel<9>::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+
+ const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
+ const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
+ const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
+ const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
+ const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
+ const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
+ const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
+ const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
+ const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Load 16 bytes from the top4 row:
+ const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t4, _convolution.data());
+
+ // Load 16 bytes from the top3 row:
+ const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);
+
+ // Load 16 bytes from the top2 row:
+ const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);
+
+ // Load 16 bytes from the top1 row:
+ const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
+ convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);
+
+ // Load 16 bytes from the middle row:
+ const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
+ convolve_row9x1(out, out2, data_m, _convolution.data() + 36);
+
+ // Load 16 bytes from the low1 row:
+ const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);
+
+ // Load 16 bytes from the low2 row:
+ const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);
+
+ // Load 16 bytes from the low3 row:
+ const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);
+
+ // Load 16 bytes from the low4 row:
+ const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
+ convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+
+template <unsigned int matrix_size>
+void NEConvolutionKernel<matrix_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_output->info()->format())
+ {
+ case Format::U8:
+ convolution<uint8_t>(window);
+ break;
+ case Format::S16:
+ convolution<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+template class arm_compute::NEConvolutionKernel<3>;
+template class arm_compute::NEConvolutionKernel<5>;
+template class arm_compute::NEConvolutionKernel<7>;
+template class arm_compute::NEConvolutionKernel<9>;
+
+/****************************************************************************************\
+ * Separable Square Convolution *
+\****************************************************************************************/
+
+template <unsigned int matrix_size>
+NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
+ : _conv_row{ { 0 } }, _border_size(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
+
+ _input = input;
+ _output = output;
+ std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
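+    // The horizontal pass always needs matrix_size / 2 columns of border on the
+    // left and right; the top/bottom border is only claimed when the border region
+    // itself has to be computed (border_undefined == false).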
+ _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ switch(_output->info()->data_type())
+ {
+ case DataType::U16:
+ convolve<uint16_t>(window);
+ break;
+ case DataType::S16:
+ convolve<int16_t>(window);
+ break;
+ case DataType::S32:
+ convolve<int32_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
+ break;
+ }
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
+{
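+    // Shift the input window two pixels to the left so the 16-byte load starts at
+    // tap -2; the base vector plus vext offsets 1..4 then cover the five taps
+    // -2..+2 of the row.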
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t data_u16 =
+ {
+ {
+ vmovl_u8(vget_low_u8(data)),
+ vmovl_u8(vget_high_u8(data))
+ }
+ };
+
+ uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
+ const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
+ const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
+ const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
+
+ int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
+
+ int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
+ },
+ input, output);
+}
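+
+// In the S32 variant vmull_n_s16 / vmlal_n_s16 widen each S16 lane to S32
+// during the multiply-accumulate, so the 5-tap sum cannot wrap; the 8-wide
+// result is produced as two S32 quads (out_low, out_high) stored back to back.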
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -3);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t data_u16 =
+ {
+ {
+ vmovl_u8(vget_low_u8(data)),
+ vmovl_u8(vget_high_u8(data))
+ }
+ };
+
+ uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -3);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -3);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
+ const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
+ const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
+ const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
+ const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
+ const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
+
+ int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
+
+ int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -4);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t data_u16 =
+ {
+ {
+ vmovl_u8(vget_low_u8(data)),
+ vmovl_u8(vget_high_u8(data))
+ }
+ };
+
+ uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
+ out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
+ out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -4);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
+ out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
+ out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+template <>
+template <>
+void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
+{
+ Window win_in(window);
+ win_in.shift(Window::DimX, -4);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
+ const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
+ const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
+ const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
+ const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
+ const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
+ const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
+
+ int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
+ out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
+
+ int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
+ out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
+ },
+ input, output);
+}
+
+template class arm_compute::NESeparableConvolutionHorKernel<5>;
+template class arm_compute::NESeparableConvolutionHorKernel<7>;
+template class arm_compute::NESeparableConvolutionHorKernel<9>;
+
+template <unsigned int matrix_size>
+NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
+ : _conv_col{ { 0 } }, _scale(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
+{
+ return BorderSize(matrix_size / 2, 0);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(scale == 0);
+
+ _input = input;
+ _output = output;
+ std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
+ _scale = scale;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
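+
+// For a normalised filter the caller usually passes the sum of the full
+// matrix coefficients as 'scale' so that the final division restores the
+// input range. Illustrative sketch (the values are an assumption, not a
+// library default):
+//
+//   const int16_t gaussian_col[5] = { 1, 4, 6, 4, 1 };
+//   NESeparableConvolutionVertKernel<5> vert_kernel;
+//   vert_kernel.configure(&tmp_u16, &dst_u8, gaussian_col, 256 /* 16 * 16 */, false);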
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::U16:
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ convolution_u16<uint8_t>(window);
+ break;
+ case DataType::S16:
+ convolution_u16<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ break;
+ case DataType::S16:
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ convolution_s16<uint8_t>(window);
+ break;
+ case DataType::S16:
+ convolution_s16<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ break;
+ case DataType::S32:
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ convolution_s32<uint8_t>(window);
+ break;
+ case DataType::S16:
+ convolution_s32<int16_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
+ break;
+ }
+}
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+
+ Window win_in(win);
+ win_in.set_dimension_step(Window::DimX, 8);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, win);
+
+ std::array<unsigned char *, matrix_size> input_ptrs{ {} };
+ const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
+ const int k_half = matrix_size / 2;
+
+ // Set row pointers
+ for(int i = -k_half; i <= k_half; ++i)
+ {
+ input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint16x8_t out0 = vdupq_n_u16(0);
+ uint16x8_t out1 = vdupq_n_u16(0);
+
+ // First half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
+ out0 = vmlaq_n_u16(out0, data, _conv_col[r]);
+ }
+
+ in.increment(Window::DimX);
+
+ // Second half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
+ out1 = vmlaq_n_u16(out1, data, _conv_col[r]);
+ }
+
+ // Scale the result if needed
+ if(_scale != 1)
+ {
+ float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
+ float32x4_t out0_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
+ out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
+ out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
+ store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
+
+ float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
+ float32x4_t out1_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
+ out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
+ out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
+ store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
+ }
+ else
+ {
+ store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
+ }
+ },
+ in, out);
+}
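+
+// NEON has no vector integer division, so scaling goes through F32: each lane
+// is multiplied by 1.0f / _scale and converted back. Per-element sketch:
+//
+//   out[i] = static_cast<OutputType>(static_cast<float>(acc[i]) * (1.0f / _scale));
+//
+// Note that vcvtq_u32_f32 truncates towards zero rather than rounding.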
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+
+ Window win_in(win);
+ win_in.set_dimension_step(Window::DimX, 8);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, win);
+
+ std::array<unsigned char *, matrix_size> input_ptrs{ {} };
+ const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
+ const int k_half = matrix_size / 2;
+
+ // Set row pointers
+ for(int i = -k_half; i <= k_half; ++i)
+ {
+ input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int16x8_t out0 = vdupq_n_s16(0);
+ int16x8_t out1 = vdupq_n_s16(0);
+
+ // First half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
+ out0 = vmlaq_n_s16(out0, data, _conv_col[r]);
+ }
+
+ in.increment(Window::DimX);
+
+ // Second half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
+ out1 = vmlaq_n_s16(out1, data, _conv_col[r]);
+ }
+
+ // Scale the result if needed
+ if(_scale != 1)
+ {
+ float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
+ float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
+ out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
+ out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
+ store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
+
+ float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
+ float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
+ out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
+ out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
+ store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
+ }
+ else
+ {
+ store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
+ }
+ },
+ in, out);
+}
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+
+ Window win_in(win);
+ win_in.set_dimension_step(Window::DimX, 8);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, win);
+
+ std::array<unsigned char *, matrix_size> input_ptrs{ {} };
+ const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
+ const int k_half = matrix_size / 2;
+
+ // Set row pointers
+ for(int i = -k_half; i <= k_half; ++i)
+ {
+ input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
+ }
+
+ const int32x4_t zero = vdupq_n_s32(0);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4x2_t out0 =
+ {
+ {
+ zero,
+ zero
+ }
+ };
+
+ int32x4x2_t out1 =
+ {
+ {
+ zero,
+ zero
+ }
+ };
+
+ // First half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
+ out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
+ out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
+ }
+
+ in.increment(Window::DimX);
+
+ // Second half
+ for(unsigned int r = 0; r < matrix_size; ++r)
+ {
+ const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
+ out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
+ out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
+ }
+
+ // Scale the result if needed
+ if(_scale != 1)
+ {
+ float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]);
+ float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
+ out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale);
+ out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
+ out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
+ out0.val[1] = vcvtq_s32_f32(out0_f32_even);
+
+ float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]);
+ float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
+ out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale);
+ out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
+ out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
+ out1.val[1] = vcvtq_s32_f32(out1_f32_even);
+ }
+
+ const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
+ store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));
+
+ const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
+ store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
+ },
+ in, out);
+}
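+
+// vld2q_s32 de-interleaves eight consecutive S32 inputs into even/odd quads so
+// that two vmlaq_n_s32 accumulators run per row; vzipq_s32 restores memory
+// order before store_results. The net computation per output element x is
+// (scalar sketch, row_ptr[r] spanning the column window):
+//
+//   int32_t acc = 0;
+//   for(unsigned int r = 0; r < matrix_size; ++r)
+//   {
+//       acc += row_ptr[r][x] * _conv_col[r];
+//   }
+//   out[x] = acc; // then scaled by 1 / _scale if _scale != 1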
+
+template class arm_compute::NESeparableConvolutionVertKernel<5>;
+template class arm_compute::NESeparableConvolutionVertKernel<7>;
+template class arm_compute::NESeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ * Rectangle Convolution *
+\****************************************************************************************/
+
+NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
+ : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
+{
+}
+
+BorderSize NEConvolutionRectangleKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
+
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
+ ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
+ ARM_COMPUTE_ERROR_ON(0 == scale);
+
+ _input = input;
+ _output = output;
+ _scale = scale;
+ _border_size = BorderSize(height / 2, width / 2);
+
+ // Setup the convolution matrix
+ const uint32_t nr_elements = width * height;
+ _convolution.resize(nr_elements);
+ std::copy_n(conv, nr_elements, _convolution.begin());
+
+ // Set function index to help choose appropriate function in run()
+ _func_idx = get_index(height) * 4 + get_index(width);
+ ARM_COMPUTE_ERROR_ON(_func_idx >= (_nr_supported_sizes * _nr_supported_sizes));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
+ AccessWindowHorizontal output_access = AccessWindowHorizontal(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
+
+ INEKernel::configure(win);
+}
+
+void NEConvolutionRectangleKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);
+
+ // uint8_t function table
+ static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
+ {
+ {
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
+ &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
+ }
+ };
+ // int16_t function table
+ static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
+ {
+ {
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
+ &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
+ }
+ };
+
+ // Run appropriate function
+ switch(_output->info()->format())
+ {
+ case Format::U8:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
+ (this->*func_table_u8[_func_idx])(window);
+ break;
+ case Format::S16:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
+ (this->*func_table_s16[_func_idx])(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
+
+unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
+{
+ switch(val)
+ {
+ case 3:
+ return 0;
+ case 5:
+ return 1;
+ case 7:
+ return 2;
+ case 9:
+ return 3;
+ default:
+ ARM_COMPUTE_ERROR("Not supported dimension size");
+ return 0;
+ }
+}
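+
+// get_index maps the supported sizes {3, 5, 7, 9} to {0, 1, 2, 3}, so
+// _func_idx = get_index(height) * 4 + get_index(width) is a row-major index
+// into the 4x4 function tables in run(). Example: width 5, height 9 gives
+// 3 * 4 + 1 = 13, selecting convolution<OutputType, 9, 5>.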
+
+template <typename OutputType, unsigned int rows, unsigned int cols>
+void NEConvolutionRectangleKernel::convolution(const Window &win)
+{
+ static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ std::array<unsigned char *, rows> input_ptrs{ {} };
+ const int16_t *conv = _convolution.data();
+ const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
+ const int k_row_half = rows / 2;
+ const int k_col_half = cols / 2;
+
+ // Set row pointers
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ int32x4_t out = vdupq_n_s32(0);
+ int32x4_t out2 = vdupq_n_s32(0);
+
+ // Perform appropriate convolution
+ for(unsigned int r = 0; r < rows; ++r)
+ {
+ const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
+ if(3 == cols)
+ {
+ convolve_row3x1(out, out2, data, conv + r * cols);
+ }
+ else if(5 == cols)
+ {
+ convolve_row5x1(out, out2, data, conv + r * cols);
+ }
+ else if(7 == cols)
+ {
+ convolve_row7x1(out, out2, data, conv + r * cols);
+ }
+ else if(9 == cols)
+ {
+ convolve_row9x1(out, out2, data, conv + r * cols);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported number of columns");
+ }
+ }
+
+ // Apply scale
+ if(_scale != 1)
+ {
+ // Convert to F32, scale and convert back to S32
+ out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
+ out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
+ }
+
+ // Clamp and store as U8 or S16:
+ store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
+ },
+ input, output);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
new file mode 100644
index 0000000000..32789cbe33
--- /dev/null
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IDistribution1D.h"
+#include "arm_compute/core/ILut.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+
+using namespace arm_compute;
+
+NECumulativeDistributionKernel::NECumulativeDistributionKernel()
+ : _input(nullptr), _distribution(nullptr), _cumulative_sum(nullptr), _output(nullptr)
+{
+}
+
+bool NECumulativeDistributionKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void NECumulativeDistributionKernel::configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, distribution, cumulative_sum, output);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+
+ set_format_if_unknown(*input->info(), Format::U8);
+
+ ARM_COMPUTE_ERROR_ON(distribution->num_bins() != cumulative_sum->num_bins());
+ ARM_COMPUTE_ERROR_ON(distribution->num_bins() != output->num_elements());
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() != output->type());
+
+ _input = input;
+ _distribution = distribution;
+ _cumulative_sum = cumulative_sum;
+ _output = output;
+
+ INEKernel::configure(calculate_max_window(*input->info()));
+}
+
+void NECumulativeDistributionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_distribution->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_cumulative_sum->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON_MSG(_distribution->num_bins() < 256, "Distribution must have at least 256 bins");
+
+ // Calculate the cumulative distribution (summed histogram).
+ const uint32_t *hist = _distribution->buffer();
+ uint32_t *cumulative_sum = _cumulative_sum->buffer();
+ uint8_t *output = _output->buffer();
+
+ std::partial_sum(hist, hist + _histogram_size, cumulative_sum);
+
+ // Count of pixels with the lowest occurring value, i.e. the minimum of the cumulative distribution (cd_min)
+ const uint32_t cd_min = *std::find_if(hist, hist + _histogram_size, [](const uint32_t &v)
+ {
+ return v > 0;
+ });
+ const uint32_t image_size = cumulative_sum[_histogram_size - 1];
+
+ ARM_COMPUTE_ERROR_ON(cd_min > image_size);
+
+ // Create mapping lookup table
+ if(image_size == cd_min)
+ {
+ std::iota(output, output + _histogram_size, 0);
+ }
+ else
+ {
+ const float diff = image_size - cd_min;
+
+ for(unsigned int x = 0; x < _histogram_size; ++x)
+ {
+ output[x] = lround((cumulative_sum[x] - cd_min) / diff * 255.0f);
+ }
+ }
+}
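+
+// The LUT above is the standard histogram-equalisation transfer function
+// T(v) = lround((cdf(v) - cd_min) / (image_size - cd_min) * 255), falling back
+// to the identity mapping when every pixel shares one value. Worked example
+// (illustrative numbers): image_size = 100, cd_min = 10 and cdf(v) = 55 gives
+// lround(45 / 90 * 255) = 128.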
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..902490ec38
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEDepthConcatenateKernel::NEDepthConcatenateKernel()
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
+{
+}
+
+BorderSize NEDepthConcatenateKernel::border_size() const
+{
+ return BorderSize(_top_bottom, _left_right);
+}
+
+void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+ // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+ // Otherwise it is not clear how the padding should be added onto the input tensor
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+ _input = input;
+ _output = output;
+ _depth_offset = depth_offset;
+ _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+ _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_rows_read_per_iteration = 1;
+
+ // The window needs to be based on input as we copy all the depths of input
+ Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size());
+
+ AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEDepthConcatenateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // Offset output
+ const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom *
+ _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2];
+ uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes;
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output_ptr + output.offset());
+
+ vst1q_f32(out_ptr, vld1q_f32(in_ptr));
+ },
+ input, output);
+}
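+
+// The byte offset computed in run() centres the input inside the (possibly
+// larger) output plane and skips _depth_offset planes, i.e. in index form:
+// out(x + _left_right, y + _top_bottom, z + _depth_offset) = in(x, y, z).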
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
new file mode 100644
index 0000000000..56612a7703
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEDepthConvertKernel::NEDepthConvertKernel()
+ : _policy(), _shift(0)
+{
+}
+
+void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(shift >= 8);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
+ "Only data_types supported [in] QS8 -> [out] F32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
+ && output->info()->data_type() != DataType::S32),
+ "Only data_types supported [in] U8 -> [out] U16, S16, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
+ "Only data_types supported [in] U16 -> [out] U8, U32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
+ "Only data_types supported [in] S16 -> [out] U8, S32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
+ "Only data_types supported [in] F32 -> [out] QS8");
+
+ _policy = policy;
+ _shift = shift;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+}
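+
+// Minimal usage sketch (tensor names are placeholders, not library symbols):
+// widen U8 pixels to S16 while pre-multiplying by 2^shift.
+//
+//   NEDepthConvertKernel convert;
+//   convert.configure(&src_u8, &dst_s16, ConvertPolicy::WRAP, 1 /* shift */);
+//   // each output element becomes static_cast<int16_t>(input) << 1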
+
+void NEDepthConvertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(nullptr == _input);
+ ARM_COMPUTE_ERROR_ON(nullptr == _output);
+ ARM_COMPUTE_ERROR_ON(_input == _output);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ switch(_output->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ /* Up-conversion QS8 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<const int8_t *>(input.ptr()));
+
+ float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_s8), fixed_point_position);
+ float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_s8), fixed_point_position);
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::U8:
+ {
+ const int16x8_t b = vdupq_n_s16(_shift);
+
+ switch(_output->info()->data_type())
+ {
+ case DataType::S16:
+ {
+ /* Up-conversion U8 -> S16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+ const int16x8x2_t texels =
+ {
+ {
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]);
+ },
+ input, output);
+ break;
+ }
+ case DataType::S32:
+ {
+ /* Up-conversion U8 -> S32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+ const int16x8x2_t texels =
+ {
+ {
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
+ vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
+ }
+ };
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0])));
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+ },
+ input, output);
+ break;
+ }
+ case DataType::U16:
+ {
+ /* Up-conversion U8 -> U16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t texels_u8 = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t texels =
+ {
+ {
+ vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
+ vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
+ }
+ };
+
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), texels.val[0]);
+ vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()) + 8, texels.val[1]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::S16:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
+
+ /* Down-conversion S16 -> U8 */
+ if(ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
+ vqshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
+ },
+ input, output);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())), b),
+ vshlq_s16(vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
+ vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::S32:
+ {
+ const int32x4_t b = vdupq_n_s32(_shift);
+
+ /* Up-conversion S16 -> S32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(input.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ const int32x4x4_t texels_s32 =
+ {
+ {
+ vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
+ vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
+ vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
+ vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
+ }
+ };
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), texels_s32.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, texels_s32.val[1]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, texels_s32.val[2]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, texels_s32.val[3]);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::U16:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
+
+ /* Down-conversion U16 -> U8 */
+ if(ConvertPolicy::SATURATE == _policy)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x8x2_t texels =
+ {
+ {
+ vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
+ vqshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+ },
+ input, output);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x8x2_t texels =
+ {
+ {
+ vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())), b),
+ vshlq_u16(vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8), b)
+ }
+ };
+
+ vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
+ },
+ input, output);
+ }
+ break;
+ }
+ case DataType::U32:
+ {
+ const int32x4_t b = vdupq_n_s32(_shift);
+
+ /* Up-conversion U16 -> U32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x8x2_t texels =
+ {
+ {
+ vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr())),
+ vld1q_u16(reinterpret_cast<uint16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ case DataType::F32:
+ {
+ switch(_output->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ const int fixed_point_position = _output->info()->fixed_point_position();
+ /* Down-conversion F32 -> QS8 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x4_t texels_f32 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+ }
+ };
+
+ const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+
+ vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
+}
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
new file mode 100644
index 0000000000..bf7e0972d5
--- /dev/null
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEDerivativeKernel::NEDerivativeKernel()
+ : _func(nullptr), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+BorderSize NEDerivativeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEDerivativeKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ const bool run_der_x = output_x != nullptr;
+ const bool run_der_y = output_y != nullptr;
+
+ if(run_der_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(run_der_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowHorizontal out_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal out_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration);
+ AccessWindowRectangle in_y_access(input->info(), 0, -border_size().top, num_elems_processed_per_iteration, num_rows_read_per_iteration);
+ AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_rows_read_per_iteration);
+
+ if(run_der_x && run_der_y)
+ {
+ _func = &NEDerivativeKernel::derivative_xy;
+ update_window_and_padding(win, in_xy_access, out_x_access, out_y_access);
+ out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ }
+ else
+ {
+ if(run_der_x)
+ {
+ _func = &NEDerivativeKernel::derivative_x;
+ update_window_and_padding(win, in_x_access, out_x_access);
+ out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ }
+ else if(run_der_y)
+ {
+ _func = &NEDerivativeKernel::derivative_y;
+ update_window_and_padding(win, in_y_access, out_y_access);
+ out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+ }
+ }
+
+ INEKernel::configure(win);
+}
+
+void NEDerivativeKernel::derivative_x(const Window &window)
+{
+ Iterator in(_input, window);
+ Iterator out_x(_output_x, window);
+
+ /* Apply 1-D centered point discrete derivative mask ([-1 0 1]) along the X direction */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ /* Load left and right data */
+ const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
+ const uint8x16_t r_data = vld1q_u8(in.ptr() + 1);
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data))));
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data))));
+
+ /* Store result of derivative along the X direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()), out0);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()) + 8, out1);
+ },
+ in, out_x);
+}
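+
+/* Scalar reference for the X derivative above (an illustrative sketch with a hypothetical helper
+ * name, not part of the kernel): out(x) = in(x + 1) - in(x - 1), widened to int16. It assumes the
+ * row has one valid border pixel on each side; the Y variant below is identical but offsets by
+ * the row stride instead of by 1. */
+static inline void reference_derivative_x(const uint8_t *in, int16_t *out, size_t width)
+{
+ for(size_t x = 0; x < width; ++x)
+ {
+ out[x] = static_cast<int16_t>(in[x + 1] - in[x - 1]);
+ }
+}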
+
+void NEDerivativeKernel::derivative_y(const Window &window)
+{
+ Iterator in(_input, window);
+ Iterator out_y(_output_y, window);
+
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ /* Apply 1-D centered point discrete derivative mask ([-1 0 1]^T) along the Y direction */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ /* Load top and bottom data */
+ const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
+ const uint8x16_t b_data = vld1q_u8(in.ptr() + stride);
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data))));
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data))));
+
+ /* Store result of derivative along the Y direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()), out0);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()) + 8, out1);
+ },
+ in, out_y);
+}
+
+void NEDerivativeKernel::derivative_xy(const Window &window)
+{
+ Iterator in(_input, window);
+ Iterator out_x(_output_x, window);
+ Iterator out_y(_output_y, window);
+
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ /* Apply 1-D centered point discrete derivative masks ([-1 0 1] and [-1 0 1]^T) along the X and Y directions */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ /* Load top, bottom, left and right data */
+ const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
+ const uint8x16_t b_data = vld1q_u8(in.ptr() + stride);
+ const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
+ const uint8x16_t r_data = vld1q_u8(in.ptr() + 1);
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data))));
+
+ /* Cast to int16 and perform the subtraction between the bottom and top data */
+ const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data))));
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out2 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data))));
+
+ /* Cast to int16 and perform the subtraction between the right and left data */
+ const int16x8_t out3 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data))));
+
+ /* Store result of derivative along the Y direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()), out0);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()) + 8, out1);
+
+ /* Store result of derivative along the X direction */
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()), out2);
+ vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()) + 8, out3);
+ },
+ in, out_x, out_y);
+}
+
+void NEDerivativeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
new file mode 100644
index 0000000000..867cf77c49
--- /dev/null
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+BorderSize NEDilateKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEDilateKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEDilateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator in(_input, window);
+ Iterator out(_output, window);
+
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ uint8_t *in_ptr = in.ptr() - 1;
+ const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride);
+ const uint8x16_t mid_data = vld1q_u8(in_ptr);
+ const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride);
+
+ uint8x8_t top_high_data = vget_high_u8(top_data);
+ uint8x8_t top_low_data = vget_low_u8(top_data);
+
+ uint8x8_t mid_high_data = vget_high_u8(mid_data);
+ uint8x8_t mid_low_data = vget_low_u8(mid_data);
+
+ uint8x8_t bot_high_data = vget_high_u8(bot_data);
+ uint8x8_t bot_low_data = vget_low_u8(bot_data);
+
+ uint8x8_t p0, p1;
+
+ p0 = top_low_data;
+ p1 = vext_u8(top_low_data, top_high_data, 1);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(top_low_data, top_high_data, 2);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = mid_low_data;
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 1);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 2);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = bot_low_data;
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 1);
+ p0 = vmax_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 2);
+ p0 = vmax_u8(p0, p1);
+
+ vst1_u8(out.ptr(), p0);
+ },
+ in, out);
+}
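+
+/* Scalar reference for the 3x3 dilation above (an illustrative sketch with a hypothetical helper
+ * name, not part of the kernel): each output pixel is the maximum over its 3x3 neighbourhood,
+ * which is what the vext/vmax sequence computes for 8 pixels at a time. 'in' points at the start
+ * of the current row and the border rows/columns are assumed valid. */
+static inline uint8_t reference_dilate_3x3(const uint8_t *in, int stride, int x)
+{
+ uint8_t v = 0;
+ for(int dy = -1; dy <= 1; ++dy)
+ {
+ for(int dx = -1; dx <= 1; ++dx)
+ {
+ const uint8_t n = in[x + dx + dy * stride];
+ v = (n > v) ? n : v;
+ }
+ }
+ return v;
+}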
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
new file mode 100644
index 0000000000..effc50e7c0
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+// Internal load
+inline float32x4_t internal_vld1q(const float *in)
+{
+ return vld1q_f32(in);
+}
+inline qint8x16_t internal_vld1q(const qint8_t *in)
+{
+ return vld1q_qs8(in);
+}
+inline qint16x8_t internal_vld1q(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+
+// Internal store
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
+{
+ vst1q_qs8(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
+{
+ vst1_qs8(p, vqmovn_s16(v));
+}
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+ vst1q_qs16(p, v);
+}
+
+// Internal vdup
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+inline qint8x16_t internal_vdupq_n(qint8_t v)
+{
+ return vdupq_n_qs8(v);
+}
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+
+// Internal vadd
+inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
+{
+ return vaddq_f32(x, y);
+}
+inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
+{
+ return vqaddq_qs8(x, y);
+}
+inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
+{
+ return vqaddq_qs16(x, y);
+}
+
+template <typename T1, typename T2, bool in_place>
+void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
+{
+ Iterator in(input, window);
+
+ if(in_place) // In place accumulate
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+ // Accumulate bias
+ internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ },
+ in);
+ }
+ else // Out of place accumulate
+ {
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T1 *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+
+ // Accumulate bias
+ internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ },
+ in, out);
+ }
+}
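+
+/* Scalar equivalent of the in-place path above (an illustrative sketch with a hypothetical helper
+ * name, not part of the kernel): every element of output plane z has the single per-channel value
+ * bias[z] added to it; the overload sets above only change how one vector of elements is loaded,
+ * broadcast, added and stored for each supported data type. */
+static inline void reference_accumulate_bias(float *plane, size_t plane_size, float bias_z)
+{
+ for(size_t i = 0; i < plane_size; ++i)
+ {
+ plane[i] += bias_z;
+ }
+}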
+} // namespace
+
+NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumulateKernel()
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
+ if(output != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(bias, output);
+ }
+ ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+
+ _func = nullptr;
+ _bias = bias;
+ _input = input;
+ _output = output;
+
+ const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type());
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic bias_access(bias->info(), 0, 0, bias->info()->dimension(0), bias->info()->dimension(1));
+ if(output != nullptr)
+ {
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, input_access, output_access, bias_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ }
+ else
+ {
+ update_window_and_padding(win, input_access, bias_access);
+ input_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape()));
+ }
+ INEKernel::configure(win);
+
+ // Set appropriate function
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
+ }
+ else if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
+ }
+ else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
+}
+
+void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _bias, window, _output);
+}
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000000..d6088981aa
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,817 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace
+{
+template <unsigned int stridex>
+float32x4_t internal_vld1q(const float *in);
+
+template <>
+float32x4_t internal_vld1q<1>(const float *in)
+{
+ return vld1q_f32(in);
+}
+
+template <>
+float32x4_t internal_vld1q<2>(const float *in)
+{
+ const float32x4x2_t tmp = vld2q_f32(in);
+ return tmp.val[0];
+}
+
+template <>
+float32x4_t internal_vld1q<3>(const float *in)
+{
+ const float32x4x3_t tmp = vld3q_f32(in);
+ return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint8x8_t internal_vld1q(const qint8_t *in);
+
+template <>
+qint8x8_t internal_vld1q<1>(const qint8_t *in)
+{
+ return vld1_qs8(in);
+}
+
+template <>
+qint8x8_t internal_vld1q<2>(const qint8_t *in)
+{
+ const qint8x8x2_t tmp = vld2_s8(in);
+ return tmp.val[0];
+}
+
+template <>
+qint8x8_t internal_vld1q<3>(const qint8_t *in)
+{
+ const qint8x8x3_t tmp = vld3_s8(in);
+ return tmp.val[0];
+}
+
+template <unsigned int stridex>
+qint16x8_t internal_vld1q(const qint16_t *in);
+
+template <>
+qint16x8_t internal_vld1q<1>(const qint16_t *in)
+{
+ return vld1q_s16(in);
+}
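+
+/* The stride-2 and stride-3 specialisations above rely on the de-interleaving NEON loads:
+ * vld2q_f32/vld3q_f32 (and vld2_s8/vld3_s8) read 2 or 3 interleaved vectors, and .val[0] keeps
+ * every 2nd or 3rd element, which is exactly the element sequence a convolution with stride 2
+ * or 3 consumes. */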
+
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+
+inline qint8x8_t internal_vdupq_n(qint8_t v)
+{
+ return vdup_n_qs8(v);
+}
+
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+ vst1q_qs16(p, v);
+}
+
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f32(x, y);
+}
+
+qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
+{
+ return vmull_qs8(x, y, fixed_point_position);
+}
+
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmlaq_f32(x, y, z);
+}
+
+inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
+{
+ return vqmlal_qs8(x, y, z, fixed_point_position);
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_1x1
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int range_z = window.z().end() - window.z().start();
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ /*
+ For a detailed explanation of how the algorithm works, refer to the convolver_3x3 class below
+ */
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ for(int oz = 0; oz < range_z; ++oz)
+ {
+ auto p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+ {
+ internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
+ const auto vk = internal_vdupq_n(*k_val);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ const int offset_xy = ih * input_stride_y;
+ auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
+ {
+ internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
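+
+/* Scalar view of the two steps above (an illustrative sketch with a hypothetical helper name,
+ * not part of the kernel): Step 1 initialises the output plane with kernel[0] * input plane 0,
+ * and Step 2 accumulates the remaining kernel_depth - 1 planes, so each output pixel ends up as
+ * a dot product along the depth axis. */
+static inline float reference_convolve_1x1(const float *in, const float *kernel, int kernel_depth, size_t plane_stride, size_t pixel)
+{
+ float acc = kernel[0] * in[pixel]; // Step 1
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ acc += kernel[p] * in[p * plane_stride + pixel]; // Step 2
+ }
+ return acc;
+}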
+
+inline float32x4x3_t load_matrix_row(const float *ptr)
+{
+ const float32x4x3_t r =
+ {
+ {
+ vld1q_dup_f32(ptr),
+ vld1q_dup_f32(1 + ptr),
+ vld1q_dup_f32(2 + ptr)
+ }
+ };
+ return r;
+}
+inline qint8x8x3_t load_matrix_row(const qint8_t *ptr)
+{
+ /* ptr points to a row of a 3x3 matrix; the function returns 3 vectors, each holding one element of that row replicated across all lanes:
+ r.val[0] holds the first element, r.val[1] the second and r.val[2] the third */
+ const qint8x8x3_t r =
+ {
+ {
+ vld1_dup_qs8(ptr),
+ vld1_dup_qs8(1 + ptr),
+ vld1_dup_qs8(2 + ptr)
+ }
+ };
+ return r;
+}
+
+template <unsigned int stridex>
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+
+template <>
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float32x4x3_t vtop =
+ {
+ {
+ vld1q_f32(in_top),
+ vld1q_f32(in_top + 4),
+ vld1q_f32(in_top + 8)
+ }
+ };
+ const float32x4x3_t vmid =
+ {
+ {
+ vld1q_f32(in_mid),
+ vld1q_f32(in_mid + 4),
+ vld1q_f32(in_mid + 8)
+ }
+ };
+ const float32x4x3_t vlow =
+ {
+ {
+ vld1q_f32(in_low),
+ vld1q_f32(in_low + 4),
+ vld1q_f32(in_low + 8)
+ }
+ };
+ float32x4x2_t out =
+ {
+ {
+ vmulq_f32(vtop.val[0], m0.val[0]),
+ vmulq_f32(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
+ out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
+ out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
+ out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
+ return out;
+}
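+
+/* Scalar view of the computation above (an illustrative sketch with a hypothetical helper name,
+ * not part of the kernel): the vext instructions materialise the shifted row views row[x + 1]
+ * and row[x + 2], so each output element is the 3x3 cross-correlation of the three input rows
+ * with the three mask rows. */
+static inline float reference_convolve_3x3(const float *top, const float *mid, const float *low,
+ const float m0[3], const float m1[3], const float m2[3], size_t x)
+{
+ return top[x] * m0[0] + top[x + 1] * m0[1] + top[x + 2] * m0[2]
+ + mid[x] * m1[0] + mid[x + 1] * m1[1] + mid[x + 2] * m1[2]
+ + low[x] * m2[0] + low[x + 1] * m2[1] + low[x + 2] * m2[2];
+}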
+
+template <>
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+{
+ float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
+qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position);
+
+template <>
+inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const qint8x8x3_t vtop =
+ {
+ {
+ vld1_qs8(in_top),
+ vld1_qs8(in_top + 8),
+ vld1_qs8(in_top + 16)
+ }
+ };
+ const qint8x8x3_t vmid =
+ {
+ {
+ vld1_qs8(in_mid),
+ vld1_qs8(in_mid + 8),
+ vld1_qs8(in_mid + 16)
+ }
+ };
+ const qint8x8x3_t vlow =
+ {
+ {
+ vld1_qs8(in_low),
+ vld1_qs8(in_low + 8),
+ vld1_qs8(in_low + 16)
+ }
+ };
+ qint16x8x2_t out =
+ {
+ {
+ vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
+ vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
+ }
+ };
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
+ out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
+ out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
+ return out;
+}
+
+template <>
+inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
+{
+ qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
+ out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
+ return out;
+}
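+
+/* The stride-2 and stride-3 specialisations above (float and fixed point) first compute the full
+ * stride-1 result and then compact the lanes a strided convolution would have produced into the
+ * front of val[0]: stride 2 keeps every 2nd stride-1 output and stride 3 every 3rd, matching the
+ * num_elems_written_per_iteration values of 16 >> conv_stride_x (F32) and 32 >> conv_stride_x
+ * (QS8) set in configure() below. */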
+
+template <unsigned int stridex>
+void store_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void store_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+ vst1q_f32(buffer + 4, values.val[1]);
+}
+
+template <>
+void store_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vget_low_f32(values.val[0]));
+}
+
+template <unsigned int stridex>
+void store_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+ vst1q_qs16(buffer + 8, values.val[1]);
+}
+
+template <>
+void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1_qs16(buffer, vget_low_s16(values.val[0]));
+}
+
+template <unsigned int stridex>
+void accumulate_results(float *buffer, const float32x4x2_t &values);
+
+template <>
+void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+ vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
+{
+ vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
+{
+ vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
+}
+
+template <unsigned int stridex>
+void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);
+
+template <>
+void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+ vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
+{
+ vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
+}
+
+template <unsigned int stridex>
+int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
+
+template <>
+int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration;
+}
+
+template <>
+int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration << 1;
+}
+
+template <>
+int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
+{
+ return num_elems_written_per_iteration * 3;
+}
+
+template <typename T1, typename T2, unsigned int stridex>
+class convolver_3x3
+{
+public:
+ static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+ {
+ ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int num_planes_z = window.z().end() - window.z().start();
+ const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // setup output window for the iterator
+ Window window_out = window;
+ window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+ window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+ window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
+
+ // setup input window for the iterator
+ Window window_in = window;
+ // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+ Iterator out(output, window_out);
+ Iterator in(input, window_in);
+ Iterator k(weights, window_k);
+
+ const uint8_t *k_ptr = k.ptr();
+
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+ uint8_t *out_ptr = out.ptr();
+ int ih = 0;
+ int oh = 0;
+ /*
+ Each thread executing this kernel computes one or more planes of the output volume.
+
+ For example, if the 3rd dimension of the output volume is 32 and four threads are used, the first thread computes the output for Z = [0,7], the second thread for Z = [8,15],
+ the third thread for Z = [16,23] and the fourth thread for Z = [24,31].
+
+ The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: the main benefit
+ is that we set up the NEON registers containing the kernel's values only once and then compute each XY position using the preloaded registers, as opposed to doing this for every XY value.
+
+ The algorithm does not require allocating any additional memory and computes the results directly in place in two stages:
+ 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
+ 2) Convolve the remaining planes and accumulate the results in the output plane initialized in step 1.
+ */
+
+ for(int oz = 0; oz < num_planes_z; ++oz)
+ {
+ uint8_t *p_out_base = out_ptr + oz * output_stride_z;
+ // Step 1
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ store_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ // Step 2
+ for(int p = 1; p < kernel_depth; ++p)
+ {
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto vk_r0 = load_matrix_row(ptr_k_r0);
+ const auto vk_r1 = load_matrix_row(ptr_k_r1);
+ const auto vk_r2 = load_matrix_row(ptr_k_r2);
+ for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+ {
+ auto in_top = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
+ for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+ in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+ {
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ accumulate_results<stridex>(p_out, vres);
+ }
+ }
+ }
+ }
+ },
+ in, out);
+ }
+};
+
+template <typename T1, typename T2>
+inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+
+template <typename T1, typename T2>
+inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+
+NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
+ : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_elems_read_per_iteration(0), _num_elems_written_per_iteration(0)
+{
+}
+
+BorderSize NEDirectConvolutionLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
+ "Pad > 0 not supported for 1x1 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
+ "Pad > 1 not supported for 3x3 weights");
+ ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
+
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+ const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+
+ _input = input;
+ _weights = weights;
+ _output = output;
+ _conv_info = conv_info;
+ _kernel_size = weights->info()->dimension(0);
+ _border_size = BorderSize(conv_pad_y, conv_pad_x);
+
+ Window win = calculate_max_window(*output->info());
+
+ switch(_kernel_size)
+ {
+ case 1:
+ {
+ _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 8 : 4;
+ _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
+
+ win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ break;
+ }
+ case 3:
+ {
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _num_elems_read_per_iteration = 12;
+ _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ }
+ else
+ {
+ _num_elems_read_per_iteration = 24;
+ _num_elems_written_per_iteration = 32 >> conv_stride_x;
+ }
+
+ // Calculate right and bottom border
+ const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
+ const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
+ _border_size.right = std::max(upper_bound_w, static_cast<int>(_kernel_size));
+ _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
+
+ // Create window and update padding
+ win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, _kernel_size, _kernel_size);
+ AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
+ update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ break;
+ }
+ }
+
+ INEKernel::configure(win);
+}
+
+void NEDirectConvolutionLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ const int kernel_size = _weights->info()->dimension(0);
+
+ switch(kernel_size)
+ {
+ case 1:
+ {
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ else
+ {
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ break;
+ }
+ case 3:
+ {
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ else
+ {
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Only kernel sizes 1x1 and 3x3 are supported.");
+ break;
+ }
+ }
+}
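+
+/* A minimal usage sketch for this kernel (illustrative only, assuming tensors src, weights and
+ * dst have already been allocated with compatible shapes and the input border has been filled
+ * by the caller):
+ *
+ * NEDirectConvolutionLayerKernel kernel;
+ * kernel.configure(&src, &weights, &dst, PadStrideInfo(1, 1, 0, 0)); // stride 1, no padding
+ * kernel.run(kernel.window()); // or split the window across threads via a scheduler
+ */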
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
new file mode 100644
index 0000000000..398503627c
--- /dev/null
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+BorderSize NEErodeKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEErodeKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEErodeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator in(_input, window);
+ Iterator out(_output, window);
+
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ uint8_t *in_ptr = in.ptr() - 1;
+ const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride);
+ const uint8x16_t mid_data = vld1q_u8(in_ptr);
+ const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride);
+
+ uint8x8_t top_high_data = vget_high_u8(top_data);
+ uint8x8_t top_low_data = vget_low_u8(top_data);
+
+ uint8x8_t mid_high_data = vget_high_u8(mid_data);
+ uint8x8_t mid_low_data = vget_low_u8(mid_data);
+
+ uint8x8_t bot_high_data = vget_high_u8(bot_data);
+ uint8x8_t bot_low_data = vget_low_u8(bot_data);
+
+ uint8x8_t p0, p1;
+
+ p0 = top_low_data;
+ p1 = vext_u8(top_low_data, top_high_data, 1);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(top_low_data, top_high_data, 2);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = mid_low_data;
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 1);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(mid_low_data, mid_high_data, 2);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = bot_low_data;
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 1);
+ p0 = vmin_u8(p0, p1);
+
+ p1 = vext_u8(bot_low_data, bot_high_data, 2);
+ p0 = vmin_u8(p0, p1);
+
+ vst1_u8(out.ptr(), p0);
+ },
+ in, out);
+}
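+
+/* Erosion is the dual of the dilation kernel earlier in this patch: the loop body is identical
+ * except that every vmax_u8 becomes vmin_u8, so each output pixel is the minimum over its 3x3
+ * neighbourhood instead of the maximum. */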
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
new file mode 100644
index 0000000000..9e8b5526a1
--- /dev/null
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstddef>
+#include <limits>
+
+using namespace arm_compute;
+
+NEFastCornersKernel::NEFastCornersKernel()
+ : INEKernel(), _input(nullptr), _output(nullptr), _threshold(0), _non_max_suppression(false)
+{
+}
+
+namespace
+{
+constexpr size_t PERMUTATIONS = 16;
+constexpr size_t PERM_SIZE = 16;
+
+inline uint8x8x2_t create_permutation_index(size_t k)
+{
+ ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS);
+
+ static const uint8_t permutations_table[PERMUTATIONS][PERM_SIZE]
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
+ { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
+ { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
+ { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
+ { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
+ { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
+ { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
+ { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
+ { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
+ { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
+ { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
+ { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
+ { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
+ { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
+ { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
+ };
+
+ const uint8x8x2_t index =
+ {
+ {
+ vld1_u8(permutations_table[k]),
+ vld1_u8(permutations_table[k] + 8)
+ }
+ };
+
+ return index;
+}
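+
+/* Scalar view of what the 16 permutations above encode (an illustrative sketch with a
+ * hypothetical helper name, not part of the kernel): a FAST-9 corner needs a run of at least 9
+ * contiguous circle texels that are all brighter than p + t or all darker than p - t; each
+ * permutation selects one of the 16 possible 9-texel runs, and the 255 entries zero the unused
+ * lanes in the vtbl lookup. */
+static inline bool reference_is_fast_corner(const uint8_t circle[16], int p, int t)
+{
+ for(size_t start = 0; start < PERMUTATIONS; ++start)
+ {
+ bool all_brighter = true;
+ bool all_darker = true;
+ for(size_t i = 0; i < 9; ++i)
+ {
+ const int c = circle[(start + i) % 16];
+ all_brighter = all_brighter && (c > p + t);
+ all_darker = all_darker && (c < p - t);
+ }
+ if(all_brighter || all_darker)
+ {
+ return true;
+ }
+ }
+ return false;
+}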
+
+inline uint8x8x4_t create_circle_index_register()
+{
+ /*
+    This function creates the index registers used to retrieve the 16 texels in the Bresenham circle of radius 3 centered at P.
+
+ . . F 0 1 . . .
+ . E . . . 2 . .
+ D . . . . . 3 .
+ C . . P . . 4 .
+ B . . . . . 5 .
+ . A . . . 6 . .
+ . . 9 8 7 . . .
+
+ Where . is an irrelevant texel value
+
+ We want to retrieve all texels [0,F]
+
+    The 4 registers in 'reg' will then be used to get these texels out of the two tables in the function get_circle_texels()
+
+ The first table holds the top 4 rows of texels
+ . . F 0 1 . . .
+ . E . . . 2 . .
+ D . . . . . 3 .
+ C . . P . . 4 .
+
+    The second table holds the bottom 3 rows of texels
+ B . . . . . 5 .
+ . A . . . 6 . .
+ . . 9 8 7 . . .
+
+ */
+ static const uint8_t top_right[8] =
+ {
+ /* The register r.val[0] will be used to retrieve these texels:
+ . . . 0 1 . . .
+ . . . . . 2 . .
+ . . . . . . 3 .
+ . . . . . . 4 .
+ */
+ 3 /* top table, first row, elem 4, value 0 in the diagram above */,
+ 4 /* top table, first row, elem 5, value 1 in the diagram above */,
+ 13 /* top table, second row, elem 6, value 2 in the diagram above */,
+ 22 /* top table, third row, elem 7, value 3 in the diagram above*/,
+ 30 /* top table, fourth row, elem 7, value 4 in the diagram above*/,
+ 255,
+ 255,
+ 255
+ };
+
+ static const uint8_t bottom_right[8] =
+ {
+ /* The register r.val[1] will be used to retrieve these texels:
+ . . . . . . 5 .
+ . . . . . 6 . .
+ . . . . 7 . . .
+ */
+ 255,
+ 255,
+ 255,
+ 255,
+ 255,
+ 6 /* low table, first row, elem 7, value 5 in the diagram above*/,
+ 13 /* low table, second row, elem 6, value 6 in the diagram above*/,
+ 20 /* low table, third row, elem 5, value 7 in the diagram above*/
+ };
+
+ static const uint8_t top_left[8] =
+ {
+ /* The register r.val[2] will be used to retrieve these texels:
+ . . F . . . . .
+ . E . . . . . .
+ D . . . . . . .
+ C . . . . . . .
+ */
+ 255,
+ 255,
+ 255,
+ 255,
+ 24 /* top table, fourth row, elem 1, value C in the diagram above */,
+ 16 /* top table, third row, elem 1, value D in the diagram above*/,
+ 9 /* top table, second row, elem 2, value E in the diagram above*/,
+ 2 /* top table, first row, elem 3, value F in the diagram above*/
+ };
+
+ static const uint8_t bottom_left[8] =
+ {
+ /* The register r.val[3] will be used to retrieve these texels:
+ B . . . . . . .
+ . A . . . . . .
+ . . 9 8 . . . .
+ */
+ 19 /* low table, third row, elem 4, value 8 in the diagram above */,
+ 18 /* low table, third row, elem 3, value 9 in the diagram above */,
+ 9 /* low table, second row, elem 2, value A in the diagram above */,
+ 0 /* low table, first row, elem 1, value B in the diagram above */,
+ 255,
+ 255,
+ 255,
+ 255
+ };
+
+ const uint8x8x4_t reg =
+ {
+ {
+ vld1_u8(top_right),
+ vld1_u8(bottom_right),
+ vld1_u8(top_left),
+ vld1_u8(bottom_left)
+ }
+ };
+
+ return reg;
+}
+
+inline uint8x16_t get_circle_texels(const uint8x8x4_t &index, const uint8x8x4_t &tbl_hi, const uint8x8x3_t &tbl_lo)
+{
+ /*
+    This function loads the 16 texels in the Bresenham circle of radius 3 into a single NEON register.
+    The parameter 'index' holds the indices that were previously set up in create_circle_index_register().
+ tbl_hi and tbl_lo are the two tables holding the texels in the window [(-3,-3),(+3,+3)] for a given texel P
+ */
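+    /*
+    vtbl4_u8 returns 0 for the out-of-range (255) lanes of the high-table index;
+    vtbx3_u8 then fills in only the lanes whose low-table index is valid, leaving
+    the texels already fetched from the high table untouched.
+    */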
+ return vcombine_u8(vtbx3_u8(vtbl4_u8(tbl_hi, index.val[0]), tbl_lo, index.val[1]),
+ vtbx3_u8(vtbl4_u8(tbl_hi, index.val[2]), tbl_lo, index.val[3]));
+}
+
+inline uint8x16_t get_permutation_texels(const uint8x8x2_t &permutation_index, const uint8x8x2_t &tbl_circle)
+{
+ /*
+    This function gathers the 9 texels of a given permutation into a NEON register
+
+ 'tbl_circle' is a LUT with the texels 0 to F
+
+ . . F 0 1 . . .
+ . E . . . 2 . .
+ D . . . . . 3 .
+ C . . P . . 4 .
+ B . . . . . 5 .
+ . A . . . 6 . .
+ . . 9 8 7 . . .
+
+ 'permutation_index' is one of the permutations below:
+
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8},
+ { F, 0, 1, 2, 3, 4, 5, 6, 7},
+ { E, F, 0, 1, 2, 3, 4, 5, 6},
+ { D, E, F, 0, 1, 2, 3, 4, 5},
+ { C, D, E, F, 0, 1, 2, 3, 4},
+ { B, C, D, E, F, 0, 1, 2, 3},
+ { A, B, C, D, E, F, 0, 1, 2},
+ { 9, A, B, C, D, E, F, 0, 1},
+ { 8, 9, A, B, C, D, E, F, 0},
+ { 7, 8, 9, A, B, C, D, E, F},
+ { 6, 7, 8, 9, A, B, C, D, E},
+ { 5, 6, 7, 8, 9, A, B, C, D},
+ { 4, 5, 6, 7, 8, 9, A, B, C},
+ { 3, 4, 5, 6, 7, 8, 9, A, B},
+ { 2, 3, 4, 5, 6, 7, 8, 9, A},
+ { 1, 2, 3, 4, 5, 6, 7, 8, 9},
+ */
+ static const uint8x8_t perm_right = vdup_n_u8(255); // init to 255 so that vtbx preserves the original values of the lanes
+
+ return vcombine_u8(vtbl2_u8(tbl_circle, permutation_index.val[0]),
+ vtbx2_u8(perm_right, tbl_circle, permutation_index.val[1]));
+}
+
+inline bool is_permutation_brighter(const uint8x16_t &permutation, const uint8x16_t &pg)
+{
+ const uint8x16_t res_gt = vcgtq_u8(permutation, pg);
+
+ return vget_lane_u64(vreinterpret_u64_u8(vand_u8(vget_high_u8(res_gt), vget_low_u8(res_gt))), 0) == std::numeric_limits<uint64_t>::max();
+}
+
+inline bool is_permutation_darker(const uint8x16_t &permutation, const uint8x16_t &pl)
+{
+ const uint8x16_t res_lt = vcltq_u8(permutation, pl);
+ const uint64x2_t u64res_lt = vreinterpretq_u64_u8(res_lt);
+ const uint64_t t3 = vgetq_lane_u64(u64res_lt, 0);
+ const uint64_t t4 = vgetq_lane_u64(u64res_lt, 1);
+
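+    // Only the first lane of the high half holds a real texel: the remaining lanes
+    // were padded with 255 in get_permutation_texels() and can never compare as
+    // darker, so a fully darker arc yields t4 == 255 (0xFF in the lowest byte only).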
+ return std::numeric_limits<uint64_t>::max() == t3 && 255 == t4;
+}
+
+inline bool is_permutation_corner(const uint8x16_t &permutation, const uint8x16_t &pg, const uint8x16_t &pl)
+{
+ return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
+}
+
+inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, uint8x8x2_t perm_indices[PERMUTATIONS])
+{
+ /*
+ This function determines whether the point 'p' is a corner.
+ */
+ uint8x16_t pg = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
+ uint8x16_t pl = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
+
+ bool corner_detected = false;
+
+ for(size_t j = 0; !corner_detected && j < PERMUTATIONS; ++j)
+ {
+ const uint8x16_t pe_texels = get_permutation_texels(perm_indices[j], tbl_circle_texels);
+ corner_detected = is_permutation_corner(pe_texels, pg, pl);
+ }
+
+ return corner_detected;
+}
+
+inline uint8x8x2_t create_circle_tbl(const uint8_t *const __restrict buffer[7], size_t in_offset, const uint8x8x4_t &circle_index_r)
+{
+ /*
+    This function builds a LUT holding the 16 texels in the Bresenham circle of radius 3.
+    circle_index_r is a set of 4 index registers used to retrieve the texels from the two row tables (see create_circle_index_register()).
+ */
+
+ //Load the texels in the window [(x-3,y-3),(x+3,y+3)].
+ //The top 4 rows are loaded in tbl_hi and the low 3 rows in tbl_lo.
+ //These two tables are then used to retrieve the texels in the Bresenham circle of radius 3.
+ const uint8x8x4_t tbl_window_hi =
+ {
+ {
+ vld1_u8(buffer[0] + in_offset),
+ vld1_u8(buffer[1] + in_offset),
+ vld1_u8(buffer[2] + in_offset),
+ vld1_u8(buffer[3] + in_offset)
+ }
+ };
+
+ const uint8x8x3_t tbl_window_lo =
+ {
+ {
+ vld1_u8(buffer[4] + in_offset),
+ vld1_u8(buffer[5] + in_offset),
+ vld1_u8(buffer[6] + in_offset)
+ }
+ };
+
+ const uint8x16_t circle_texels = get_circle_texels(circle_index_r, tbl_window_hi, tbl_window_lo);
+
+ const uint8x8x2_t tbl_circle_texels =
+ {
+ {
+ vget_low_u8(circle_texels),
+ vget_high_u8(circle_texels)
+ }
+ };
+
+ return tbl_circle_texels;
+}
+
+inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, uint8x8x2_t perm_indices[PERMUTATIONS])
+{
+ uint8_t b = 255;
+ uint8_t a = tolerance;
+
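+    // Binary search for the corner strength: 'a' is always a threshold at which 'p'
+    // still passes the corner test, 'b' one at which it is assumed to fail. On exit
+    // 'a' is the largest passing threshold and is returned as the score.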
+ while(b - a > 1)
+ {
+ const uint16_t ab = a + b;
+ const uint8_t c = ab >> 1;
+
+ if(point_is_fast_corner(p, c, tbl_circle, perm_indices))
+ {
+ a = c;
+ }
+ else
+ {
+ b = c;
+ }
+ }
+
+ return a;
+}
+} // namespace
+
+BorderSize NEFastCornersKernel::border_size() const
+{
+ return BorderSize(3);
+}
+
+void NEFastCornersKernel::configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MSG(border_undefined == false, "Not implemented");
+
+ _input = input;
+ _output = output;
+ _threshold = threshold;
+ _non_max_suppression = non_max_suppression;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 1;
+ constexpr unsigned int num_rows_read_per_iteration = 7;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEFastCornersKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ std::array<uint8x8x2_t, PERMUTATIONS> perm_index{ {} };
+ /*
+        We use a LUT loaded with 7 rows of uint8_t from the input image window [-3,-3]...[+3,+3] to retrieve the texels in the Bresenham circle of radius 3 and pack them into one NEON register uint8x16_t.
+        The line below sets up the NEON index registers used to get these texels out of the table.
+ */
+ const uint8x8x4_t circle_index_r = create_circle_index_register();
+ /*
+        We put the 16 circle texels in a LUT to easily generate all the permutations. The for loop below sets up the indices for each permutation.
+ */
+ for(size_t k = 0; k < PERMUTATIONS; ++k)
+ {
+ perm_index[k] = create_permutation_index(k);
+ }
+
+ Iterator in(_input, window);
+ Iterator out(_output, window);
+
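+    // Base pointers to the 7 rows of the 7x7 window; adding the per-pixel iterator
+    // offset inside the loop below re-centres them on the current texel.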
+ const uint8_t *const __restrict in_row[7] =
+ {
+ _input->ptr_to_element(Coordinates(-3, -3)),
+ _input->ptr_to_element(Coordinates(-3, -2)),
+ _input->ptr_to_element(Coordinates(-3, -1)),
+ _input->ptr_to_element(Coordinates(-3, 0)),
+ _input->ptr_to_element(Coordinates(-3, 1)),
+ _input->ptr_to_element(Coordinates(-3, 2)),
+ _input->ptr_to_element(Coordinates(-3, 3))
+ };
+
+ auto is_rejected = [](uint8_t p, uint8_t q, uint8_t a, uint8_t b)
+ {
+ const bool p_is_in_ab = (a <= p) && (p <= b);
+ const bool q_is_in_ab = (a <= q) && (q <= b);
+ return p_is_in_ab && q_is_in_ab;
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const size_t in_offset = in.offset();
+ const uint8_t p0 = *in.ptr();
+ const uint8_t b = std::min(p0 + _threshold, 255);
+ const uint8_t a = std::max(p0 - _threshold, 0);
+ uint8_t score = 0;
+ /*
+            Fast check to discard points which cannot be corners, avoiding the expensive evaluation of the 16 permutations.
+
+            Pixels 1 and 9 (texels 0 and 8 of the Bresenham circle) are examined first: if both I1 and I9 are within [Ip - t, Ip + t], then the candidate p is not a corner.
+ */
+ const uint8_t p1 = (in_offset + in_row[0])[3];
+ const uint8_t p9 = (in_offset + in_row[6])[3];
+
+ if(!is_rejected(p1, p9, a, b))
+ {
+            /* Pixels 5 and 13 (texels 4 and C of the circle) are examined next: for p to remain a candidate, at least three of the four pixels 1, 5, 9 and 13 must all be brighter than Ip + t or all darker than Ip - t */
+ const uint8_t p5 = (in_offset + in_row[3])[6];
+ const uint8_t p13 = (in_offset + in_row[3])[0];
+
+ if(!is_rejected(p5, p13, a, b))
+ {
+                /* At this stage the full test over the 16 permutations is used to classify the point as a corner or not */
+ const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
+
+ if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index.data()))
+ {
+ if(_non_max_suppression)
+ {
+ score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index.data());
+ }
+ else
+ {
+ score = 1;
+ }
+ }
+ }
+ }
+
+ *out.ptr() = score;
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
new file mode 100644
index 0000000000..7e7e1c2501
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+NEFillArrayKernel::NEFillArrayKernel()
+ : _input(nullptr), _output(nullptr), _threshold(0)
+{
+}
+
+void NEFillArrayKernel::configure(const IImage *input, uint8_t threshold, IKeyPointArray *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ _input = input;
+ _output = output;
+ _threshold = threshold;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_read_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+bool NEFillArrayKernel::is_parallelisable() const
+{
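+    // The keypoint array is shared output state and push_back() is not guaranteed
+    // to be thread-safe, so this kernel is restricted to a single thread.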
+ return false;
+}
+
+void NEFillArrayKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input(_input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8_t value = *input.ptr();
+
+ if(value >= _threshold)
+ {
+ KeyPoint p;
+ p.x = id.x();
+ p.y = id.y();
+ p.strength = value;
+ p.tracking_status = 1;
+
+ if(!_output->push_back(p))
+ {
+                return; //Overflowed: the array is full, no more points can be stored
+ }
+ }
+ },
+ input);
+}
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
new file mode 100644
index 0000000000..bd99242b11
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEFillBorderKernel::NEFillBorderKernel()
+ : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(0)
+{
+}
+
+void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+
+ _tensor = tensor;
+ _border_size = border_size;
+ _mode = border_mode;
+ _constant_border_value = constant_border_value;
+
+ _border_size.limit(tensor->info()->padding());
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ INEKernel::configure(win);
+}
+
+void NEFillBorderKernel::run(const Window &window)
+{
+ // If there is no border: early exit
+ if(_border_size.empty())
+ {
+ return;
+ }
+
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_mode)
+ {
+ case BorderMode::CONSTANT:
+ {
+ switch(_tensor->info()->data_type())
+ {
+ case DataType::U8:
+ fill_constant_value_single_channel<uint8_t>(window);
+ break;
+ case DataType::QS8:
+ case DataType::S8:
+ fill_constant_value_single_channel<int8_t>(window);
+ break;
+ case DataType::U16:
+ fill_constant_value_single_channel<uint16_t>(window);
+ break;
+ case DataType::S16:
+ case DataType::QS16:
+ fill_constant_value_single_channel<int16_t>(window);
+ break;
+ case DataType::U32:
+ fill_constant_value_single_channel<uint32_t>(window);
+ break;
+ case DataType::S32:
+ fill_constant_value_single_channel<int32_t>(window);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ fill_constant_value_single_channel<float>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ }
+ break;
+ }
+ case BorderMode::REPLICATE:
+ {
+ switch(_tensor->info()->data_type())
+ {
+ case DataType::U8:
+ fill_replicate_single_channel<uint8_t>(window);
+ break;
+ case DataType::QS8:
+ case DataType::S8:
+ fill_replicate_single_channel<int8_t>(window);
+ break;
+ case DataType::U16:
+ fill_replicate_single_channel<uint16_t>(window);
+ break;
+ case DataType::S16:
+ case DataType::QS16:
+ fill_replicate_single_channel<int16_t>(window);
+ break;
+ case DataType::U32:
+ fill_replicate_single_channel<uint32_t>(window);
+ break;
+ case DataType::S32:
+ fill_replicate_single_channel<int32_t>(window);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ fill_replicate_single_channel<float>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ }
+ break;
+ }
+ case BorderMode::UNDEFINED:
+ break; // Nothing to do here
+ default:
+ ARM_COMPUTE_ERROR("Unknown border mode");
+ }
+}
+
+template <typename T>
+void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
+{
+ uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+ const size_t &width = _tensor->info()->valid_region().shape[0];
+ const size_t &height = _tensor->info()->valid_region().shape[1];
+
+ // Left and right border
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(_tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates & id)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
+ const auto left_val = *reinterpret_cast<T *>(vertical_it.ptr());
+ const auto right_val = *(reinterpret_cast<T *>(vertical_it.ptr()) + width - 1);
+
+ // Fill left and right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left, left_val);
+ std::fill_n(row_start + width, _border_size.right, right_val);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ Iterator plane_it(_tensor, window);
+
+ // Iterate over all XY planes
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto first_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset());
+
+ // Top border
+ for(int i = -_border_size.top; i < 0; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Copy top rows including left/right borders
+ std::copy_n(first_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+ }
+
+ const auto last_row = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + (height - 1) * _tensor->info()->strides_in_bytes()[1]);
+
+ // Bottom border
+ for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Copy bottom rows including left/right borders
+ std::copy_n(last_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left);
+ }
+ },
+ plane_it);
+}
+
+template <typename T>
+void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
+{
+ T constant_border_value;
+ _constant_border_value.get(constant_border_value);
+
+ uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
+ const size_t &width = _tensor->info()->valid_region().shape[0];
+ const size_t &height = _tensor->info()->valid_region().shape[1];
+
+ // Left and right border
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(_tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates & id)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + vertical_it.offset());
+
+ // Fill left and right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left, constant_border_value);
+ std::fill_n(row_start + width, _border_size.right, constant_border_value);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ Iterator plane_it(_tensor, window);
+
+ // Iterate over all XY planes
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Top border
+ for(int i = -_border_size.top; i < 0; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Fill top rows including left/right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+ }
+
+ // Bottom border
+ for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
+ }
+ },
+ plane_it);
+}
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
new file mode 100644
index 0000000000..699a5d9299
--- /dev/null
+++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEFillInnerBorderKernel::NEFillInnerBorderKernel()
+ : _tensor(nullptr), _border_size(0), _constant_border_value(0)
+{
+}
+
+void NEFillInnerBorderKernel::configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::S32, DataType::F32);
+
+ _tensor = input;
+ _border_size = border_size;
+ _constant_border_value = constant_border_value;
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win.set(Window::DimY, Window::Dimension(0, 1, 1));
+ win.use_tensor_dimensions(_tensor->info(), Window::DimZ);
+ INEKernel::configure(win);
+}
+
+void NEFillInnerBorderKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // If there is no border: early exit
+ if(_border_size.empty())
+ {
+ return;
+ }
+
+ switch(_tensor->info()->data_type())
+ {
+ case DataType::U8:
+ fill_value_single_channel<uint8_t>(window);
+ break;
+ case DataType::S16:
+ fill_value_single_channel<int16_t>(window);
+ break;
+ case DataType::S32:
+ fill_value_single_channel<int32_t>(window);
+ break;
+ case DataType::F32:
+ static_assert(sizeof(float) == 4, "Float must be 32 bit");
+ fill_value_single_channel<float>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not handled");
+ break;
+ }
+}
+
+template <typename T>
+void NEFillInnerBorderKernel::fill_value_single_channel(const Window &window)
+{
+ const size_t stride = _tensor->info()->strides_in_bytes()[1];
+ const size_t width = _tensor->info()->dimension(0);
+ const size_t height = _tensor->info()->dimension(1);
+
+ T constant_border_value;
+ _constant_border_value.get(constant_border_value);
+
+ // Left and right border
+ // All X values are set at once
+ Window vertical(window);
+ vertical.set(Window::DimY, Window::Dimension(0, height, 1));
+
+ Iterator vertical_it(_tensor, vertical);
+
+ execute_window_loop(vertical, [&](const Coordinates & id)
+ {
+ std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()), _border_size.left, constant_border_value);
+ std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()) + width - _border_size.right, _border_size.right, constant_border_value);
+ },
+ vertical_it);
+
+ // Top and bottom border
+ // All values are set at once
+ Iterator horizontal_it(_tensor, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ for(size_t i = 0; i < _border_size.top; ++i)
+ {
+ std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + i * stride), width, constant_border_value);
+ }
+
+ for(size_t i = 0; i < _border_size.bottom; ++i)
+ {
+ std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + (height - i - 1) * stride), width, constant_border_value);
+ }
+ },
+ horizontal_it);
+}
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000000..3ff8b7b201
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace
+{
+void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
+{
+ const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+ // Set window for output tensor
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.25f);
+ Iterator in(input, window);
+
+ win_out.set_dimension_step(Window::DimX, 32);
+ Iterator out(output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const uint8x8x4_t data =
+ {
+ {
+ vld1_u8(in.ptr() + 0 * in_stride),
+ vld1_u8(in.ptr() + 1 * in_stride),
+ vld1_u8(in.ptr() + 2 * in_stride),
+ vld1_u8(in.ptr() + 3 * in_stride),
+ }
+ };
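+        // vst4_u8 interleaves the four rows element by element, storing
+        // a00 a10 a20 a30 a01 a11 a21 a31 ... as one 32-byte output row.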
+ vst4_u8(out.ptr(), data);
+ },
+ in, out);
+}
+
+void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window)
+{
+ const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+ // Set window for output tensor
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.25f);
+ Iterator in(input, window);
+
+ win_out.set_dimension_step(Window::DimX, 16);
+ Iterator out(output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x4x4_t data =
+ {
+ {
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 0 * in_stride)),
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 1 * in_stride)),
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 2 * in_stride)),
+ vld1_u16(reinterpret_cast<uint16_t *>(in.ptr() + 3 * in_stride)),
+ }
+ };
+ vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
+ },
+ in, out);
+}
+
+void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window)
+{
+ const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+ // Set window for output tensor
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.25f);
+ Iterator in(input, window);
+
+ win_out.set_dimension_step(Window::DimX, 16);
+ Iterator out(output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint32x4x4_t data =
+ {
+ {
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 0 * in_stride)),
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 1 * in_stride)),
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 2 * in_stride)),
+ vld1q_u32(reinterpret_cast<uint32_t *>(in.ptr() + 3 * in_stride))
+ }
+ };
+ vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
+ },
+ in, out);
+}
+} // namespace
+
+NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
+ : _func(nullptr)
+{
+}
+
+void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(1) / 4.0f));
+
+ _input = input;
+ _output = output;
+
+ unsigned int num_elems_processed_per_iteration_x = 4;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ num_elems_processed_per_iteration_x = 8;
+ _func = &gemm_interleave_8bit_elements;
+ break;
+ case 2:
+ _func = &gemm_interleave_16bit_elements;
+ break;
+ case 4:
+ _func = &gemm_interleave_32bit_elements;
+ break;
+ default:
+ ARM_COMPUTE_ERROR_ON("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
+ AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ update_window_and_padding(win, output_access, input_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMInterleave4x4Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ /*
+ * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+ * |a30 a31 a32 a33|
+ *
+     * After this operation, the output matrix will have the following shape: [ width * 4, ceil(height / 4.0f) ]
+ */
+ (*_func)(_input, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..3558c686b1
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _output_offset(0), _output_mult_int(0), _shift(0)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _a_offset = a_offset;
+ _b_offset = b_offset;
+ _output_offset = output_offset;
+ _output_mult_int = output_mult_int;
+ _shift = shift;
+
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowHorizontal in0_access(input0->info(), 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x);
+
+ update_window_and_padding(win, in0_access, in1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ INEKernel::configure(win);
+}
+
+void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
+ const size_t out_stride = _output->info()->strides_in_bytes()[1];
+
+    /* Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix */
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() >> 2, window.y().end() >> 2, 1));
+
+    /* Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the transposed input matrix B has 16 times fewer columns than the output matrix */
+ Window win_b(window);
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 4, window.x().end() >> 4, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    /* The step_x and step_y for the output matrix have already been set in configure() */
+ Iterator ina(_input0, win_a);
+ Iterator inb(_input1, win_b);
+ Iterator out(_output, window);
+
+ const int32x4_t voffset_a = vdupq_n_s32(_a_offset);
+ const int32x4_t voffset_b = vdupq_n_s32(_b_offset);
+ const int32x4_t vshiftr = vdupq_n_s32(-_shift);
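+    // NEON has no vector right shift by a runtime amount, so vshlq_s32 with a
+    // negative shift is used below to perform the arithmetic right shift by _shift.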
+
+ const int width_b = _input1->info()->dimension(0);
+
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // The reshaping gives a cache-friendly access pattern and avoids the data re-arrangements otherwise needed to compute 16x4 elements per iteration
+ // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ const uint8_t *mtx_a0 = ina.ptr();
+ const uint8_t *mtx_b0 = inb.ptr();
+
+ // Accumulators for the block 0
+ int32x4x4_t c0 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ // Accumulators for the block 1
+ int32x4x4_t c1 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ // Accumulators for the block 2
+ int32x4x4_t c2 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ // Accumulators for the block 3
+ int32x4x4_t c3 =
+ {
+ {
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset),
+ vdupq_n_s32(_output_offset)
+ }
+ };
+
+ int k = 0;
+ // This for loop performs 4 accumulations per iteration
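+        // (each iteration consumes 16 bytes of interleaved A, i.e. 4 values for each
+        // of the 4 accumulations, and 64 bytes of transposed B, i.e. 16 values each)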
+ for(; k <= (width_b - 64); k += 64, mtx_a0 += 16, mtx_b0 += 64)
+ {
+ const uint8x8_t p00 = vld1_u8(mtx_a0 + 0);
+ const uint8x8_t p01 = vld1_u8(mtx_a0 + 8);
+ const uint8x8_t q00l = vld1_u8(mtx_b0 + 0);
+ const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+ const uint8x8_t q01l = vld1_u8(mtx_b0 + 16);
+ const uint8x8_t q01h = vld1_u8(mtx_b0 + 24);
+ const uint8x8_t q02l = vld1_u8(mtx_b0 + 32);
+ const uint8x8_t q02h = vld1_u8(mtx_b0 + 40);
+ const uint8x8_t q03l = vld1_u8(mtx_b0 + 48);
+ const uint8x8_t q03h = vld1_u8(mtx_b0 + 56);
+
+ const int32x4_t ia0l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+ const int32x4_t ia0h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p00))));
+ const int32x4_t ia1l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p01))));
+ const int32x4_t ia1h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p01))));
+
+ const int32x2x4_t ia0 =
+ {
+ {
+ vget_low_s32(ia0l),
+ vget_high_s32(ia0l),
+ vget_low_s32(ia0h),
+ vget_high_s32(ia0h)
+ }
+ };
+
+ const int32x2x4_t ia1 =
+ {
+ {
+ vget_low_s32(ia1l),
+ vget_high_s32(ia1l),
+ vget_low_s32(ia1h),
+ vget_high_s32(ia1h)
+ }
+ };
+
+ const int32x4x4_t ib0 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
+ }
+ };
+
+ const int32x4x4_t ib1 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01h))))
+ }
+ };
+
+ const int32x4x4_t ib2 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02h))))
+ }
+ };
+
+ const int32x4x4_t ib3 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03h))))
+ }
+ };
+
+ // 4x4 block 0 - Accumulation 0
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia0.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia0.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia0.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia0.val[1], 1);
+ // 4x4 block 0 - Accumulation 1
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib1.val[0], ia0.val[2], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib1.val[0], ia0.val[2], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib1.val[0], ia0.val[3], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib1.val[0], ia0.val[3], 1);
+ // 4x4 block 0 - Accumulation 2
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib2.val[0], ia1.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib2.val[0], ia1.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib2.val[0], ia1.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib2.val[0], ia1.val[1], 1);
+ // 4x4 block 0 - Accumulation 3
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib3.val[0], ia1.val[2], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib3.val[0], ia1.val[2], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib3.val[0], ia1.val[3], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib3.val[0], ia1.val[3], 1);
+
+ // 4x4 block 1 - Accumulation 0
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia0.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia0.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia0.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia0.val[1], 1);
+ // 4x4 block 1 - Accumulation 1
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib1.val[1], ia0.val[2], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib1.val[1], ia0.val[2], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib1.val[1], ia0.val[3], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib1.val[1], ia0.val[3], 1);
+ // 4x4 block 1 - Accumulation 2
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib2.val[1], ia1.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib2.val[1], ia1.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib2.val[1], ia1.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib2.val[1], ia1.val[1], 1);
+ // 4x4 block 1 - Accumulation 3
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib3.val[1], ia1.val[2], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib3.val[1], ia1.val[2], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib3.val[1], ia1.val[3], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib3.val[1], ia1.val[3], 1);
+
+ // 4x4 block 2 - Accumulation 0
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia0.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia0.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia0.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia0.val[1], 1);
+ // 4x4 block 2 - Accumulation 1
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib1.val[2], ia0.val[2], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib1.val[2], ia0.val[2], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib1.val[2], ia0.val[3], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib1.val[2], ia0.val[3], 1);
+ // 4x4 block 2 - Accumulation 2
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib2.val[2], ia1.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib2.val[2], ia1.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib2.val[2], ia1.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib2.val[2], ia1.val[1], 1);
+ // 4x4 block 2 - Accumulation 3
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib3.val[2], ia1.val[2], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib3.val[2], ia1.val[2], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib3.val[2], ia1.val[3], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib3.val[2], ia1.val[3], 1);
+
+ // 4x4 block 3 - Accumulation 0
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia0.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia0.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia0.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia0.val[1], 1);
+ // 4x4 block 3 - Accumulation 1
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib1.val[3], ia0.val[2], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib1.val[3], ia0.val[2], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib1.val[3], ia0.val[3], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib1.val[3], ia0.val[3], 1);
+ // 4x4 block 3 - Accumulation 2
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib2.val[3], ia1.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib2.val[3], ia1.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib2.val[3], ia1.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib2.val[3], ia1.val[1], 1);
+ // 4x4 block 3 - Accumulation 3
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib3.val[3], ia1.val[2], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib3.val[3], ia1.val[2], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib3.val[3], ia1.val[3], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib3.val[3], ia1.val[3], 1);
+ }
+
+ // This for loop handles the left-over accumulations
+ for(; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+ {
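+            // One accumulation per iteration: 4 interleaved values of A and one
+            // 16-wide transposed row of B.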
+ const uint8x8_t p00 = vld1_u8(mtx_a0);
+ const uint8x8_t q00l = vld1_u8(mtx_b0);
+ const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
+
+ const int32x4_t ia0 = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
+
+ const int32x2x2_t ia =
+ {
+ {
+ vget_low_s32(ia0),
+ vget_high_s32(ia0)
+ }
+ };
+
+ const int32x4x4_t ib0 =
+ {
+ {
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
+ vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
+ }
+ };
+
+ // 4x4 block 0
+ c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia.val[0], 0);
+ c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia.val[0], 1);
+ c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia.val[1], 0);
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia.val[1], 1);
+
+ // 4x4 block 1
+ c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia.val[0], 0);
+ c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia.val[0], 1);
+ c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia.val[1], 0);
+ c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia.val[1], 1);
+
+ // 4x4 block 2
+ c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia.val[0], 0);
+ c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia.val[0], 1);
+ c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia.val[1], 0);
+ c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia.val[1], 1);
+
+ // 4x4 block 3
+ c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia.val[0], 0);
+ c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia.val[0], 1);
+ c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia.val[1], 0);
+ c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia.val[1], 1);
+ }
+
+ c0.val[0] = vshlq_s32(vmulq_n_s32(c0.val[0], _output_mult_int), vshiftr);
+ c0.val[1] = vshlq_s32(vmulq_n_s32(c0.val[1], _output_mult_int), vshiftr);
+ c0.val[2] = vshlq_s32(vmulq_n_s32(c0.val[2], _output_mult_int), vshiftr);
+ c0.val[3] = vshlq_s32(vmulq_n_s32(c0.val[3], _output_mult_int), vshiftr);
+
+ c1.val[0] = vshlq_s32(vmulq_n_s32(c1.val[0], _output_mult_int), vshiftr);
+ c1.val[1] = vshlq_s32(vmulq_n_s32(c1.val[1], _output_mult_int), vshiftr);
+ c1.val[2] = vshlq_s32(vmulq_n_s32(c1.val[2], _output_mult_int), vshiftr);
+ c1.val[3] = vshlq_s32(vmulq_n_s32(c1.val[3], _output_mult_int), vshiftr);
+
+ c2.val[0] = vshlq_s32(vmulq_n_s32(c2.val[0], _output_mult_int), vshiftr);
+ c2.val[1] = vshlq_s32(vmulq_n_s32(c2.val[1], _output_mult_int), vshiftr);
+ c2.val[2] = vshlq_s32(vmulq_n_s32(c2.val[2], _output_mult_int), vshiftr);
+ c2.val[3] = vshlq_s32(vmulq_n_s32(c2.val[3], _output_mult_int), vshiftr);
+
+ c3.val[0] = vshlq_s32(vmulq_n_s32(c3.val[0], _output_mult_int), vshiftr);
+ c3.val[1] = vshlq_s32(vmulq_n_s32(c3.val[1], _output_mult_int), vshiftr);
+ c3.val[2] = vshlq_s32(vmulq_n_s32(c3.val[2], _output_mult_int), vshiftr);
+ c3.val[3] = vshlq_s32(vmulq_n_s32(c3.val[3], _output_mult_int), vshiftr);
+
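+        // Narrow the requantized s32 accumulators to u8 with saturation: c0..c3 hold
+        // the four 4x4 blocks and each r.val[i] gathers row i across all four blocks.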
+ const uint8x16x4_t r =
+ {
+ {
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[0]), vqmovn_s32(c1.val[0]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[0]), vqmovn_s32(c3.val[0])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[1]), vqmovn_s32(c1.val[1]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[1]), vqmovn_s32(c3.val[1])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[2]), vqmovn_s32(c1.val[2]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[2]), vqmovn_s32(c3.val[2])))),
+ vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[3]), vqmovn_s32(c1.val[3]))),
+ vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[3]), vqmovn_s32(c3.val[3]))))
+ }
+ };
+
+ uint8_t *const mtx_out = out.ptr();
+ vst1q_u8(mtx_out + 0 * out_stride, r.val[0]);
+ vst1q_u8(mtx_out + 1 * out_stride, r.val[1]);
+ vst1q_u8(mtx_out + 2 * out_stride, r.val[2]);
+ vst1q_u8(mtx_out + 3 * out_stride, r.val[3]);
+ },
+ ina, inb, out);
+}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000000..7a3bae50c0
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+
+ _biases = biases;
+ _accum = accum;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
+ biases_access);
+
+ AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
+
+ // Set the valid region for the accum tensor
+ Coordinates coord;
+ coord.set_num_dimensions(accum->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, accum->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_biases;
+ win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in0_out(_accum, window);
+ Iterator in1(_biases, win_biases);
+
+ switch(_accum->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
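+ // vld4q/vst4q de-interleave and re-interleave with the same pattern, so the net effect is an element-wise add over 16 contiguous floats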
+ const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
+ const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
+ const float32x4x4_t res =
+ {
+ {
+ vaddq_f32(accum.val[0], biases.val[0]),
+ vaddq_f32(accum.val[1], biases.val[1]),
+ vaddq_f32(accum.val[2], biases.val[2]),
+ vaddq_f32(accum.val[3], biases.val[3])
+ }
+ };
+
+ vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+ case DataType::QS8:
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const qint8x16_t accum = vld1q_qs8(reinterpret_cast<const qint8_t *>(in0_out.ptr()));
+ const qint8x16_t biases = vld1q_qs8(reinterpret_cast<const qint8_t *>(in1.ptr()));
+
+ vst1q_qs8(reinterpret_cast<qint8_t *>(in0_out.ptr()), vqaddq_qs8(accum, biases));
+ },
+ in0_out, in1);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000000..71dd4c7aa1
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const float32x4_t beta_f32 = vdupq_n_f32(beta);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(out.ptr());
+
+ float32x4x4_t alpha_ab =
+ {
+ {
+ vld1q_f32(out_ptr + 0),
+ vld1q_f32(out_ptr + 4),
+ vld1q_f32(out_ptr + 8),
+ vld1q_f32(out_ptr + 12)
+ }
+ };
+
+ const float32x4x4_t c =
+ {
+ {
+ vld1q_f32(in_ptr + 0),
+ vld1q_f32(in_ptr + 4),
+ vld1q_f32(in_ptr + 8),
+ vld1q_f32(in_ptr + 12)
+ }
+ };
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+ alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+ alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+ alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+
+ vst1q_f32(out_ptr + 0, alpha_ab.val[0]);
+ vst1q_f32(out_ptr + 4, alpha_ab.val[1]);
+ vst1q_f32(out_ptr + 8, alpha_ab.val[2]);
+ vst1q_f32(out_ptr + 12, alpha_ab.val[3]);
+ },
+ in, out);
+}
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+ const float16x8_t beta_f16 = vdupq_n_f16(beta);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
+
+ float16x8x2_t alpha_ab =
+ {
+ {
+ vld1q_f16(out_ptr + 0),
+ vld1q_f16(out_ptr + 8)
+ }
+ };
+
+ float16x8x2_t c =
+ {
+ {
+ vld1q_f16(in_ptr + 0),
+ vld1q_f16(in_ptr + 8)
+ }
+ };
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+ alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+
+ vst1q_f16(out_ptr + 0, alpha_ab.val[0]);
+ vst1q_f16(out_ptr + 8, alpha_ab.val[1]);
+ },
+ in, out);
+}
+#endif
+
+void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
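+ // Represent beta in the tensor's Q(fixed_point_position) format so the saturating multiply-accumulate below operates on matching fixed-point operands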
+ const int fixed_point_position = input->info()->fixed_point_position();
+ const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<qint8_t *>(out.ptr());
+
+ qint8x16_t alpha_ab = vld1q_qs8(out_ptr);
+ const qint8x16_t c = vld1q_qs8(in_ptr);
+
+ // Multiply matrix C by its weight and accumulate
+ alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position);
+
+ vst1q_qs8(out_ptr, alpha_ab);
+ },
+ in, out);
+}
+} // namespace
+
+NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
+ : INESimpleKernel(), _func(nullptr), _beta(0.0f)
+{
+}
+
+void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ switch(input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = &matrix_addition_f32;
+ break;
+ case DataType::QS8:
+ _func = &matrix_addition_qs8;
+ break;
+ case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ _func = &matrix_addition_f16;
+ break;
+#endif
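+ // Note: without ARM_COMPUTE_ENABLE_FP16 this case intentionally falls through to the unsupported-type error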
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+
+ _beta = beta;
+}
+
+void NEGEMMMatrixAdditionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
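+ // When beta is zero, matrix C contributes nothing to the result, so the addition can be skipped entirely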
+ if(_beta != 0.0f)
+ {
+ (*_func)(_input, _output, window, _beta);
+ }
+}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..dcfbb13081
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,1168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+template <bool multiply_alpha>
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * window.thread_id();
+ const int window_step_x = 16 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
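+ // e.g. with two threads, thread 0 handles x = 0, 32, 64, ... and thread 1 handles x = 16, 48, 80, ...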
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
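+ // The rounded-up window end may generate iterations that start past the end of matrix B; skip them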
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
+
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(inb.ptr());
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
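+ // Main loop: consume four elements of vector A per iteration (two unrolled steps of two), each paired with the matching row of matrix B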
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
+
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+
+ a0l = vld1_f32(vec_a);
+
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float a0 = *vec_a;
+
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+ acc0 = vmulq_f32(acc0, alpha_f32);
+ acc1 = vmulq_f32(acc1, alpha_f32);
+ acc2 = vmulq_f32(acc2, alpha_f32);
+ acc3 = vmulq_f32(acc3, alpha_f32);
+ }
+
+ const auto vec_out = reinterpret_cast<float *>(out.ptr());
+
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ },
+ ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+ const int fixed_point_position = input0->info()->fixed_point_position();
+
+ // The implementation computes 32 elements per iteration
+ const int window_start_x = 32 * window.thread_id();
+ const int window_step_x = 32 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
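+ // Accumulate into 16-bit lanes: vqmlal_qs8 widens each 8-bit product, reducing the risk of saturation during the dot product before the final narrowing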
+ // Reset accumulators
+ qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+
+ auto vec_a = reinterpret_cast<const qint8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const qint8_t *>(inb.ptr());
+
+ auto vec_a_end_addr = vec_a + num_elems_vec_a;
+ for(; vec_a <= (vec_a_end_addr - 2);)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1);
+
+ const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride);
+ const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride);
+ const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride);
+ const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride);
+ const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride);
+ const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride);
+ const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride);
+ const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride);
+
+ // First accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+ // Second accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(vec_a);
+
+ const qint8x8_t b00 = vld1_qs8(matrix_b + 0);
+ const qint8x8_t b01 = vld1_qs8(matrix_b + 8);
+ const qint8x8_t b02 = vld1_qs8(matrix_b + 16);
+ const qint8x8_t b03 = vld1_qs8(matrix_b + 24);
+
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ // Convert back to qint8x8_t and saturate
+ qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+ qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+ qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+ qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+ acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+ acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+ acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+ // Store 8x4 output elements
+ vst1_qs8(mtx_out0 + 0, acc00_qs8);
+ vst1_qs8(mtx_out0 + 8, acc01_qs8);
+ vst1_qs8(mtx_out0 + 16, acc02_qs8);
+ vst1_qs8(mtx_out0 + 24, acc03_qs8);
+ },
+ ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
+
+ // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ // Set step_x and step_y for matrix B. Scale the X range by a factor of 4, as the transposed input matrix B has 4 times fewer columns than the output matrix
+ // The step along the x direction is twice in_b_stride because each iteration computes two 4x4 blocks
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+ // The implementation assumes that matrix A has been reshaped with NEGEMMInterleave4x4 and matrix B with NEGEMMTranspose1xW
+ // The reshaping makes the implementation cache-friendly and avoids the data re-arrangements needed for computing 16x4 elements per iteration
+ // All the values needed for computing a single 4x4 block are read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ float32x4_t acc00 = vdupq_n_f32(0.f);
+ float32x4_t acc10 = vdupq_n_f32(0.f);
+ float32x4_t acc20 = vdupq_n_f32(0.f);
+ float32x4_t acc30 = vdupq_n_f32(0.f);
+
+ float32x4_t acc01 = vdupq_n_f32(0.f);
+ float32x4_t acc11 = vdupq_n_f32(0.f);
+ float32x4_t acc21 = vdupq_n_f32(0.f);
+ float32x4_t acc31 = vdupq_n_f32(0.f);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
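+ // Main loop: unrolled four times, each unrolled step consuming 8 interleaved values of A and 8 values from each of the two transposed B blocks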
+ for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+ float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+ float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+ float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+ float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+ float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+
+ a0 = vld1q_dup_f32(mtx_a0 + 0);
+ a1 = vld1q_dup_f32(mtx_a0 + 1);
+ a2 = vld1q_dup_f32(mtx_a0 + 2);
+ a3 = vld1q_dup_f32(mtx_a0 + 3);
+ b00 = vld1q_f32(mtx_b0);
+ b10 = vld1q_f32(mtx_b1);
+ b01 = vld1q_f32(mtx_b0 + 4);
+ b11 = vld1q_f32(mtx_b1 + 4);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ a4 = vld1q_dup_f32(mtx_a0 + 4);
+ a5 = vld1q_dup_f32(mtx_a0 + 5);
+ a6 = vld1q_dup_f32(mtx_a0 + 6);
+ a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b01, a4);
+ acc10 = vmlaq_f32(acc10, b01, a5);
+ acc20 = vmlaq_f32(acc20, b01, a6);
+ acc30 = vmlaq_f32(acc30, b01, a7);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b11, a4);
+ acc11 = vmlaq_f32(acc11, b11, a5);
+ acc21 = vmlaq_f32(acc21, b11, a6);
+ acc31 = vmlaq_f32(acc31, b11, a7);
+
+ mtx_a0 += 8;
+ mtx_b0 += 8;
+ mtx_b1 += 8;
+ }
+
+ for(; mtx_b0 < mtx_b0_end_addr;)
+ {
+ float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+ float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+ float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+ float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+ float32x4_t b00 = vld1q_f32(mtx_b0);
+ float32x4_t b10 = vld1q_f32(mtx_b1);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+ // 4x4 block 0
+ acc00 = vmlaq_f32(acc00, b00, a0);
+ acc10 = vmlaq_f32(acc10, b00, a1);
+ acc20 = vmlaq_f32(acc20, b00, a2);
+ acc30 = vmlaq_f32(acc30, b00, a3);
+
+ // 4x4 block 1
+ acc01 = vmlaq_f32(acc01, b10, a0);
+ acc11 = vmlaq_f32(acc11, b10, a1);
+ acc21 = vmlaq_f32(acc21, b10, a2);
+ acc31 = vmlaq_f32(acc31, b10, a3);
+
+ mtx_a0 += 4;
+ mtx_b0 += 4;
+ mtx_b1 += 4;
+ }
+
+ // Multiply by the weight of matrix product (alpha)
+ if(multiply_alpha)
+ {
+ const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
+ acc00 = vmulq_f32(acc00, alpha_f32);
+ acc10 = vmulq_f32(acc10, alpha_f32);
+ acc20 = vmulq_f32(acc20, alpha_f32);
+ acc30 = vmulq_f32(acc30, alpha_f32);
+ acc01 = vmulq_f32(acc01, alpha_f32);
+ acc11 = vmulq_f32(acc11, alpha_f32);
+ acc21 = vmulq_f32(acc21, alpha_f32);
+ acc31 = vmulq_f32(acc31, alpha_f32);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
+ const auto mtx_out1 = mtx_out0 + 4;
+
+ // Store the 4 blocks
+ vst1q_f32(mtx_out0, acc00);
+ vst1q_f32(mtx_out1, acc01);
+ vst1q_f32(mtx_out0 + out_stride1, acc10);
+ vst1q_f32(mtx_out1 + out_stride1, acc11);
+ vst1q_f32(mtx_out0 + out_stride2, acc20);
+ vst1q_f32(mtx_out1 + out_stride2, acc21);
+ vst1q_f32(mtx_out0 + out_stride3, acc30);
+ vst1q_f32(mtx_out1 + out_stride3, acc31);
+ },
+ ina, inb, out);
+}
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+
+ // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ // Set step_x and step_y for matrix B. Scale the X range by a factor of 8, as the transposed input matrix B has 8 times fewer columns than the output matrix
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+ // Number of iterations of the inner loop. Since each loop iteration performs 8 accumulations, num_it = (width_mtx_b / 4) / 8
+ const size_t num_it = ((input1->info()->dimension(0)) >> 2) >> 3;
+
+ const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr());
+ const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr());
+ auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
+ float16x8x4_t c =
+ {
+ {
+ vdupq_n_f16(0.f),
+ vdupq_n_f16(0.f),
+ vdupq_n_f16(0.f),
+ vdupq_n_f16(0.f)
+ }
+ };
+
+ /*
+ This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+ |a00 a01 a02 a03 | a04 a05 a06 a07|
+ |a10 a11 a12 a13 | a14 a15 a16 a17|
+ |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ...
+ |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a16 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ...
+ |a40 a41 a42 a43 | a44 a45 a46 a47|
+ |a50 a51 a52 a53 | a54 a55 a56 a57|
+ |a60 a61 a62 a63 | a64 a65 a66 a67|
+ |a70 a71 a72 a73 | a74 a75 a76 a77|
+
+ After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
+
+ B Matrix has been transposed as shown below
+
+ |b00 b01 b02 b03 b04 b05 b06 b07|
+ |b10 b11 b12 b13 b14 b15 b16 b17|
+ |b20 b21 b22 b23 b24 b25 b26 b27|
+ |b30 b31 b32 b33 b34 b35 b36 b37|
+ ------------------->
+
+ |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37|
+
+ c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30
+ c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31
+
+ The output tensor's XY-plane must have the shape [ width * 8, height / 8 ]. All other dimensions must have the same size as the input.
+ */
+ for(size_t k = num_it; k > 0; mtx_a0 += 16, mtx_b0 += 32, --k)
+ {
+ const float16x8_t p00 = vld1q_f16(mtx_a0);
+ const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+ const float16x8_t q00 = vld1q_f16(mtx_b0);
+ const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
+ const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
+ const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
+
+ c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
+ c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
+ c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
+ c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+ }
+
+ if(multiply_alpha)
+ {
+ c.val[0] = vmulq_f16(c.val[0], alpha_f16);
+ c.val[1] = vmulq_f16(c.val[1], alpha_f16);
+ c.val[2] = vmulq_f16(c.val[2], alpha_f16);
+ c.val[3] = vmulq_f16(c.val[3], alpha_f16);
+ }
+
+ vst1q_f16(mtx_out + 0 * out_stride, c.val[0]);
+ vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
+ vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
+ vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
+ },
+ ina, inb, out);
+#else
+ ARM_COMPUTE_ERROR("Not implemented");
+#endif
+}
+
+template <bool multiply_alpha>
+void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
+{
+ const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
+ const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
+ const size_t out_stride2 = out_stride1 * 2;
+ const size_t out_stride3 = out_stride1 * 3;
+ const int num_elems_matrix_b_x = input1->info()->dimension(0);
+ const int fixed_point_position = input0->info()->fixed_point_position();
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ ARM_COMPUTE_UNUSED(alpha_qs8);
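+ // alpha_qs8 is only read when the multiply_alpha template parameter is true; the macro silences the unused-variable warning in the other instantiation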
+
+ // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(input1->info()->num_dimensions() >= 3)
+ {
+ win_b = window;
+ }
+ // Set step_x and step_y for matrix B. Scale the X range by a factor of 16, as the transposed input matrix B has 16 times fewer columns than the output matrix
+ // The step along the x direction is twice in_b_stride because each iteration computes two 16x4 blocks
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator ina(input0, win_a);
+ Iterator inb(input1, win_b);
+ Iterator out(output, window);
+
+ // The implementation assumes that matrix A has been reshaped with NEGEMMInterleave4x4 and matrix B with NEGEMMTranspose1xW
+ // The reshaping makes the implementation cache-friendly and avoids the data re-arrangements needed for computing 32x4 elements per iteration
+ // All the values needed for computing a single 32x4 block are read from consecutive memory positions
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto mtx_a0 = reinterpret_cast<const qint8_t *>(ina.ptr());
+ auto mtx_b0 = reinterpret_cast<const qint8_t *>(inb.ptr());
+ auto mtx_b1 = mtx_b0 + in_b_stride;
+
+ qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc10_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc20_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc30_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc11_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc21_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc31_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc12_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc22_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc32_qs16 = vdupq_n_qs16(0);
+
+ qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc13_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc23_qs16 = vdupq_n_qs16(0);
+ qint16x8_t acc33_qs16 = vdupq_n_qs16(0);
+
+ int k = 0;
+ // This for loop performs 2 accumulations
+ for(; k <= (num_elems_matrix_b_x - 32); k += 32)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+ const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+ const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+ const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4);
+ const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5);
+ const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6);
+ const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7);
+
+ const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+ const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+ const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+ const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+ // First accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+
+ const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16);
+ const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24);
+ const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16);
+ const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24);
+
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+ asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif
+
+ // Second accumulation
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position);
+
+ mtx_a0 += 8;
+ mtx_b0 += 32;
+ mtx_b1 += 32;
+ }
+
+ // This for loop performs the left over accumulations
+ for(; k < num_elems_matrix_b_x; k += 16)
+ {
+ const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
+ const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
+ const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
+ const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
+
+ const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
+ const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
+ const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
+ const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
+
+ acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
+ acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
+ acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
+ acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
+ acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
+ acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
+ acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
+ acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
+ acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
+ acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
+ acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
+ acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
+ acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
+ acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
+ acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
+ acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
+
+ mtx_a0 += 4;
+ mtx_b0 += 16;
+ mtx_b1 += 16;
+ }
+
+ // Convert back to qint8x8_t and saturate
+ qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
+ qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16);
+ qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16);
+ qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16);
+
+ qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
+ qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16);
+ qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16);
+ qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16);
+
+ qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
+ qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16);
+ qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16);
+ qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16);
+
+ qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
+ qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16);
+ qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16);
+ qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16);
+
+ // Multiply by the weight of the matrix product (alpha)
+ if(multiply_alpha)
+ {
+ acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
+ acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position);
+ acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position);
+ acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position);
+ acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
+ acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position);
+ acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position);
+ acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position);
+ acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
+ acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position);
+ acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position);
+ acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position);
+ acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
+ acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position);
+ acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position);
+ acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position);
+ }
+
+ const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
+
+ // Store 32x4 output elements
+ vst1_qs8(mtx_out0 + 0, acc00_qs8);
+ vst1_qs8(mtx_out0 + 8, acc01_qs8);
+ vst1_qs8(mtx_out0 + 16, acc02_qs8);
+ vst1_qs8(mtx_out0 + 24, acc03_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8);
+ vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8);
+ vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8);
+ vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8);
+ },
+ ina, inb, out);
+}
+
+} // namespace
+
+NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _alpha(1.0f)
+{
+}
+
+void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+ if(output->info()->dimension(1) == 1)
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ }
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _alpha = alpha;
+
+ unsigned int num_elems_processed_per_iteration_x = 0;
+ const unsigned int num_elems_processed_per_iteration_y = 4;
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+ if((output->info()->dimension(1) == 1))
+ {
+ switch(input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration_x = 16;
+ break;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
+ output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+ }
+ else
+ {
+ switch(input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration_x = 8;
+ break;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+ case DataType::F16:
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ num_elems_processed_per_iteration_x = 8;
+ break;
+#endif
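+ // Note: without ARM_COMPUTE_ENABLE_FP16 this case intentionally falls through to the unsupported-type error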
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input0->info(), 0, 0, 4, 1, 1.f, 0.25f),
+ AccessWindowTranspose(input1->info(), 0, 0, 4, 1, 0.f, 0.25f),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+ }
+}
+
+void NEGEMMMatrixMultiplyKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
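+ // Compare alpha against 1 with a small tolerance so the template instantiation without the final multiply can be selected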
+ bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f;
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+ if((_output->info()->dimension(1) == 1))
+ {
+ switch(_input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ multiply_alpha ? vector_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+ vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::QS8:
+ {
+ multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+ vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+ }
+ else
+ {
+ switch(_input0->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ multiply_alpha ? matrix_matrix_multiply_f32<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::QS8:
+ {
+ multiply_alpha ? matrix_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
+ break;
+ }
+ case DataType::F16:
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
+ matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
+ break;
+#endif
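+ // Note: without ARM_COMPUTE_ENABLE_FP16 this case intentionally falls through to the unsupported-type error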
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000000..ccf5cb4de3
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstring>
+
+using namespace arm_compute;
+
+void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t transpose_w = 16 / input->info()->element_size();
+ output_shape.set(0, input->info()->dimension(1) * transpose_w);
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
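+ // e.g. for a 64x64 F32 input, transpose_w = 4, so the output shape is [ 256, 16 ]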
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+ const float scale_x = num_elems_processed_per_iteration;
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEGEMMTranspose1xWKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ /*
+     * The following example shows how the transpose 1xW operation works when the input data type is F32
+ *
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ */
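+
+    // Illustrative scalar equivalent of the cases below (shown for an element size
+    // of 1 byte): each iteration copies one 16-byte chunk of the input row to
+    // out.ptr() + (y * 16) + (x / 16) * out_stride, i.e.
+    //
+    //   for(size_t i = 0; i < 16; ++i)
+    //   {
+    //       out_ptr[i] = in_ptr[i];
+    //   }
+    //
+    // The vectorised cases collapse this loop into a single 128-bit load/store (vld1q/vst1q)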
+
+    // Set the window for the output tensor. The X and Y dimensions are set to 0 to allow a multi-threaded implementation and future batched matrix multiplications
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, window);
+ Iterator out(_output, win_out);
+
+ switch(_input->info()->element_size())
+ {
+ case 1:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1];
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+                // Output address = base addr + (y * 16) + (x / 16) * stride
+ const uint8_t *in_ptr = in.ptr();
+ uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride;
+ vst1q_u8(out_ptr, vld1q_u8(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ case 2:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+                // Output address = base addr + (y * 8) + (x / 8) * stride
+ const auto in_ptr = reinterpret_cast<const uint16_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<uint16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
+ vst1q_u16(out_ptr, vld1q_u16(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ case 4:
+ {
+ const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+                // Output address = base addr + (y * 4) + (x / 4) * stride
+ const auto in_ptr = reinterpret_cast<const uint32_t *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<uint32_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
+ vst1q_u32(out_ptr, vld1q_u32(in_ptr));
+ },
+ in, out);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
new file mode 100644
index 0000000000..419f4825ef
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+BorderSize NEGaussian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEGaussian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
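+
+    // 16 pixels are read per row although only 8 outputs are produced: the extra
+    // columns provide the left/right neighbours consumed via vextq_s16 in run()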
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEGaussian3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const uint8_t *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ const uint8_t *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ const uint8_t *input_top_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t four = vdupq_n_s16(4);
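+
+    // 3x3 Gaussian convolution matrix; the division by 16 is performed by the final
+    // narrowing shift vqshrun_n_s16(out, 4):
+    //   | 1 2 1 |
+    //   | 2 4 2 | * 1/16
+    //   | 1 2 1 |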
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //top left
+ int16x8_t out = top_s16.val[0];
+ //top mid
+ out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), two);
+ //top right
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out = vmlaq_s16(out, mid_s16.val[0], two);
+ //mid mid
+ out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1), four);
+ //mid right
+ out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
+ //bot left
+ out = vaddq_s16(out, bot_s16.val[0]);
+ //bot mid
+ out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1_u8(output.ptr(), vqshrun_n_s16(out, 4));
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
new file mode 100644
index 0000000000..f872cc2f0a
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NEGaussian5x5HorKernel::NEGaussian5x5HorKernel()
+ : _border_size(0)
+{
+}
+
+BorderSize NEGaussian5x5HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEGaussian5x5HorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEGaussian5x5HorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output(_output, window);
+
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
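+
+    // Horizontal pass of the separable 5x5 Gaussian, 1D coefficients [ 1 4 6 4 1 ].
+    // No normalisation happens here: the vertical pass narrows with
+    // vqshrn_n_u16(..., 8), dividing by 256 = 16 * 16 once both passes are applied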
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vaddq_s16(data_s16.val[0], vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
+ },
+ input, output);
+}
+
+BorderSize NEGaussian5x5VertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 32;
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEGaussian5x5VertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const uint8_t *input_top2_ptr = _input->ptr_to_element(Coordinates(0, -2));
+ const uint8_t *input_top_ptr = _input->ptr_to_element(Coordinates(0, -1));
+ const uint8_t *input_mid_ptr = _input->ptr_to_element(Coordinates(0, 0));
+ const uint8_t *input_low_ptr = _input->ptr_to_element(Coordinates(0, 1));
+ const uint8_t *input_low2_ptr = _input->ptr_to_element(Coordinates(0, 2));
+
+ const uint16x8_t six = vdupq_n_u16(6);
+ const uint16x8_t four = vdupq_n_u16(4);
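+
+    // Each iteration filters 16 output pixels as two 8-lane halves, read 16 bytes
+    // (8 S16 values) apart, so that a single vcombine_u8/vst1q_u8 stores the result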
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const size_t input_offset_high_s16 = input.offset();
+ const size_t input_offset_low_s16 = input.offset() + 16;
+
+ //HIGH DATA
+ //top2
+ uint16x8_t data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_high_s16)));
+ uint16x8_t out_high = data_high;
+ //top
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_high_s16)));
+ out_high = vmlaq_u16(out_high, data_high, four);
+ //mid
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_high_s16)));
+ out_high = vmlaq_u16(out_high, data_high, six);
+ //low
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_high_s16)));
+ out_high = vmlaq_u16(out_high, data_high, four);
+ //low2
+ data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_high_s16)));
+ out_high = vaddq_u16(out_high, data_high);
+
+ //LOW DATA
+ //top2
+ uint16x8_t data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_low_s16)));
+ uint16x8_t out_low = data_low;
+ //top
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_low_s16)));
+ out_low = vmlaq_u16(out_low, data_low, four);
+ //mid
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_low_s16)));
+ out_low = vmlaq_u16(out_low, data_low, six);
+ //low
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_low_s16)));
+ out_low = vmlaq_u16(out_low, data_low, four);
+ //low2
+ data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_low_s16)));
+ out_low = vaddq_u16(out_low, data_low);
+
+ vst1q_u8(output.ptr(), vcombine_u8(vqshrn_n_u16(out_high, 8),
+ vqshrn_n_u16(out_low, 8)));
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
new file mode 100644
index 0000000000..52d1fbf028
--- /dev/null
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEGaussianPyramidHorKernel::NEGaussianPyramidHorKernel()
+ : _border_size(0), _l2_load_offset(0)
+{
+}
+
+BorderSize NEGaussianPyramidHorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 32;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr float scale_x = 0.5f;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
+
+ // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
+ // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
+ // a pixel is even or odd is determined based on the tensor shape not the
+ // valid region!)
+ // Thus the offset from which the first pixel (L2) for the convolution is
+ // loaded depends on the anchor and shape of the valid region.
+ // In the case of an even shape (= even image width) we need to load L2
+ // from -2 if the anchor is odd and from -1 if the anchor is even. That
+ // makes sure that L2 is always loaded from an odd pixel.
+ // On the other hand, for an odd shape (= odd image width) we need to load
+ // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
+ // achieve the opposite effect.
+    // The condition therefore simplifies to checking whether anchor + shape is
+    // odd (load from -2) or even (load from -1), since a sum is odd exactly when
+    // one of the two addends is odd and the other is even.
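+    // For example, with an even image width and anchor 0, anchor + shape is even,
+    // so the offset computed below is bumped from -2 to -1 and the filter centres
+    // on the odd pixels (1, 3, 5, ...) as required.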
+ _l2_load_offset = -border_size().left;
+
+ if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
+ {
+ _l2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
+ valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]);
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEGaussianPyramidHorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(window.x().step() % 2);
+
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, _l2_load_offset);
+
+ Iterator in(_input, win_in);
+
+ // The output is half the width of the input
+ Window win_out(window);
+ win_out.scale(Window::DimX, 0.5f);
+
+ Iterator out(_output, win_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16x2_t data_2q = vld2q_u8(in.ptr());
+ const uint8x16_t &data_even = data_2q.val[0];
+ const uint8x16_t &data_odd = data_2q.val[1];
+
+ const int16x8_t data_l2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_even)));
+ const int16x8_t data_l1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_odd)));
+ const int16x8_t data_m = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 1))));
+ const int16x8_t data_r1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_odd, data_odd, 1))));
+ const int16x8_t data_r2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 2))));
+
+ int16x8_t out_val = vaddq_s16(data_l2, data_r2);
+ out_val = vmlaq_s16(out_val, data_l1, four);
+ out_val = vmlaq_s16(out_val, data_m, six);
+ out_val = vmlaq_s16(out_val, data_r1, four);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()), out_val);
+ },
+ in, out);
+}
+
+NEGaussianPyramidVertKernel::NEGaussianPyramidVertKernel()
+ : _t2_load_offset(0)
+{
+}
+
+BorderSize NEGaussianPyramidVertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_rows_processed_per_iteration = 2;
+
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_written_per_iteration = 1;
+
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ constexpr float scale_y = 0.5f;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration), border_undefined, border_size());
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration, 1.f, scale_y);
+
+    // Determine whether we need to load even or odd rows. See
+    // NEGaussianPyramidHorKernel::configure for a detailed explanation.
+ _t2_load_offset = -border_size().top;
+
+ if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
+ {
+ _t2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region();
+ valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
+ valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]);
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+void NEGaussianPyramidVertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(window.x().step() != 16);
+ ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ static const uint16x8_t six = vdupq_n_u16(6);
+ static const uint16x8_t four = vdupq_n_u16(4);
+
+ Window win_in(window);
+    // Need to load 8 values twice instead of 16 values at once
+ win_in.set_dimension_step(Window::DimX, 8);
+ win_in.shift(Window::DimY, _t2_load_offset);
+
+ Iterator in(_input, win_in);
+
+ // Output's height is half of input's
+ Window win_out(window);
+ win_out.scale(Window::DimY, 0.5f);
+
+ Iterator out(_output, win_out);
+
+ const uint8_t *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 0));
+ const uint8_t *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 1));
+ const uint8_t *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 2));
+ const uint8_t *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 3));
+ const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 4));
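+
+    // win_in has already been shifted by _t2_load_offset, so rows 0..4 here are the
+    // t2, t1, mid, b1 and b2 taps of the [ 1 4 6 4 1 ] column filter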
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Low data
+ const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
+ const uint16x8_t data_low_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
+ const uint16x8_t data_low_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
+ const uint16x8_t data_low_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
+ const uint16x8_t data_low_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
+
+ uint16x8_t out_low = vaddq_u16(data_low_t2, data_low_b2);
+ out_low = vmlaq_u16(out_low, data_low_t1, four);
+ out_low = vmlaq_u16(out_low, data_low_m, six);
+ out_low = vmlaq_u16(out_low, data_low_b1, four);
+
+ in.increment(Window::DimX);
+
+ // High data
+ const uint16x8_t data_high_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
+ const uint16x8_t data_high_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
+ const uint16x8_t data_high_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
+ const uint16x8_t data_high_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
+ const uint16x8_t data_high_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
+
+ uint16x8_t out_high = vaddq_u16(data_high_t2, data_high_b2);
+ out_high = vmlaq_u16(out_high, data_high_t1, four);
+ out_high = vmlaq_u16(out_high, data_high_m, six);
+ out_high = vmlaq_u16(out_high, data_high_b1, four);
+
+ vst1q_u8(out.ptr(), vcombine_u8(vqshrn_n_u16(out_low, 8), vqshrn_n_u16(out_high, 8)));
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
new file mode 100644
index 0000000000..404ad8a388
--- /dev/null
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cstring>
+
+using namespace arm_compute;
+
+namespace
+{
+void cell_width_lt8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr,
+ size_t mag_stride, size_t phase_stride, size_t cell_width, size_t cell_height, size_t num_bins, float phase_scale)
+{
+ const float32x4_t scale_f32 = vdupq_n_f32(phase_scale);
+ static const float32x4_t one_f32 = vdupq_n_f32(1.0f);
+ static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
+ static const int32x4_t zero_s32 = vdupq_n_s32(0);
+ static const int32x4_t one_s32 = vdupq_n_s32(1);
+ const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins);
+
+ memset(output_ptr, 0, sizeof(float) * num_bins);
+
+ for(size_t yc = 0; yc < cell_height; ++yc)
+ {
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(cell_width) - 4; xc += 4)
+ {
+ // Load magnitude and phase values
+ const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
+ const int16x4_t mag_s16 = vld1_s16(mag_row_ptr + xc + yc * mag_stride);
+
+ // Convert magnitude and phase to float
+ const float32x4_t mag_f32 = vcvtq_f32_s32(vmovl_s16(mag_s16));
+ float32x4_t phase_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(phase_u8))));
+
+ // Scale phase: phase * scale + 0.5f
+ phase_f32 = vmlaq_f32(zerofive_f32, phase_f32, scale_f32);
+
+ // Compute histogram index.
+ int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);
+
+ // Compute magnitude weights (w0 and w1)
+ const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);
+
+ // w1 = phase_f32 - hidx_f32
+ const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);
+
+ // w0 = 1.0 - w1
+ const float32x4_t w0_f32 = vsubq_f32(one_f32, w1_f32);
+
+            // Compute the contributions for splitting the vote between two bins
+ const float32x4_t mag_w0_f32 = vmulq_f32(mag_f32, w0_f32);
+ const float32x4_t mag_w1_f32 = vmulq_f32(mag_f32, w1_f32);
+
+ // Weighted vote between 2 bins
+
+ // Check if the histogram index is equal to num_bins. If so, replace the index with 0
+ uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
+ hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
+
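+            // NEON has no scatter store and several lanes may target the same bin,
+            // so each vote is accumulated lane by lane with scalar adds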
+ // Bin 0
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w0_f32, 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w0_f32, 3);
+
+ hidx_s32 = vaddq_s32(hidx_s32, one_s32);
+
+ // Check if the histogram index is equal to num_bins
+ mask = vceqq_s32(hidx_s32, num_bins_s32);
+ hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
+
+ // Bin1
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w1_f32, 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w1_f32, 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w1_f32, 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
+ }
+
+ for(; xc < static_cast<int32_t>(cell_width); ++xc)
+ {
+ const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
+ const float mag_value = *(mag_row_ptr + xc + yc * mag_stride);
+
+ const float w1 = phase_value - std::floor(phase_value);
+
+            // The quantised phase is the histogram index in [0, num_bins - 1]
+            // The modulo wraps hidx back to 0 when it reaches num_bins
+ const auto hidx = static_cast<size_t>(phase_value) % num_bins;
+
+ // Weighted vote between 2 bins
+ *(output_ptr + hidx) += mag_value * (1.0f - w1);
+ *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
+ }
+ }
+}
+
+void cell_width_ge8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
+ size_t cell_height, size_t num_bins, float phase_scale)
+{
+ const float32x4_t scale_f32 = vdupq_n_f32(phase_scale);
+ static const float32x4_t one_f32 = vdupq_n_f32(1.0f);
+ static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
+ static const int32x4_t zero_s32 = vdupq_n_s32(0);
+ static const int32x4_t one_s32 = vdupq_n_s32(1);
+ const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins);
+
+ memset(output_ptr, 0, sizeof(float) * num_bins);
+
+ for(size_t yc = 0; yc < cell_height; ++yc)
+ {
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(cell_width) - 8; xc += 8)
+ {
+ // Load magnitude and phase values
+ const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
+ const int16x8_t mag_s16 = vld1q_s16(mag_row_ptr + xc + yc * mag_stride);
+
+ // Convert phase to U16
+ const uint16x8_t phase_u16 = vmovl_u8(phase_u8);
+
+ // Convert magnitude to float32
+ const float32x4x2_t mag_f32 =
+ {
+ {
+ vcvtq_f32_s32(vmovl_s16(vget_low_s16(mag_s16))),
+ vcvtq_f32_s32(vmovl_s16(vget_high_s16(mag_s16)))
+ }
+ };
+
+ // Convert phase to float32
+ float32x4x2_t phase_f32 =
+ {
+ {
+ vcvtq_f32_u32(vmovl_u16(vget_low_u16(phase_u16))),
+ vcvtq_f32_u32(vmovl_u16(vget_high_u16(phase_u16)))
+ }
+ };
+
+ // Scale phase: phase * scale + 0.5f
+ phase_f32.val[0] = vmlaq_f32(zerofive_f32, phase_f32.val[0], scale_f32);
+ phase_f32.val[1] = vmlaq_f32(zerofive_f32, phase_f32.val[1], scale_f32);
+
+ // Compute histogram index.
+ int32x4x2_t hidx_s32 =
+ {
+ {
+ vcvtq_s32_f32(phase_f32.val[0]),
+ vcvtq_s32_f32(phase_f32.val[1])
+ }
+ };
+
+ // Compute magnitude weights (w0 and w1)
+ const float32x4x2_t hidx_f32 =
+ {
+ {
+ vcvtq_f32_s32(hidx_s32.val[0]),
+ vcvtq_f32_s32(hidx_s32.val[1])
+ }
+ };
+
+ float32x4x2_t w1_f32 =
+ {
+ {
+ vsubq_f32(phase_f32.val[0], hidx_f32.val[0]),
+ vsubq_f32(phase_f32.val[1], hidx_f32.val[1])
+ }
+ };
+
+ float32x4x2_t w0_f32 =
+ {
+ {
+ vsubq_f32(one_f32, w1_f32.val[0]),
+ vsubq_f32(one_f32, w1_f32.val[1])
+ }
+ };
+
+            // Compute the contributions for splitting the vote between two bins
+ const float32x4x2_t mag_w0_f32 =
+ {
+ {
+ vmulq_f32(mag_f32.val[0], w0_f32.val[0]),
+ vmulq_f32(mag_f32.val[1], w0_f32.val[1])
+ }
+ };
+
+ const float32x4x2_t mag_w1_f32 =
+ {
+ {
+ vmulq_f32(mag_f32.val[0], w1_f32.val[0]),
+ vmulq_f32(mag_f32.val[1], w1_f32.val[1])
+ }
+ };
+
+ // Weighted vote between 2 bins
+
+ // Check if the histogram index is equal to num_bins
+ uint32x4x2_t mask =
+ {
+ {
+ vceqq_s32(hidx_s32.val[0], num_bins_s32),
+ vceqq_s32(hidx_s32.val[1], num_bins_s32)
+ }
+ };
+
+ hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
+ hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
+
+ // First bin - Low
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w0_f32.val[0], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w0_f32.val[0], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w0_f32.val[0], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w0_f32.val[0], 3);
+
+ // First bin - high
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w0_f32.val[1], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w0_f32.val[1], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w0_f32.val[1], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w0_f32.val[1], 3);
+
+ hidx_s32.val[0] = vaddq_s32(hidx_s32.val[0], one_s32);
+ hidx_s32.val[1] = vaddq_s32(hidx_s32.val[1], one_s32);
+
+ // Check if the histogram index is equal to num_bins
+ mask.val[0] = vceqq_s32(hidx_s32.val[0], num_bins_s32);
+ mask.val[1] = vceqq_s32(hidx_s32.val[1], num_bins_s32);
+
+ hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
+ hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
+
+ // Second bin - Low
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w1_f32.val[0], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w1_f32.val[0], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w1_f32.val[0], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w1_f32.val[0], 3);
+
+ // Second bin - high
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w1_f32.val[1], 0);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w1_f32.val[1], 1);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w1_f32.val[1], 2);
+ *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w1_f32.val[1], 3);
+ }
+
+ for(; xc < static_cast<int32_t>(cell_width); xc++)
+ {
+ const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
+ const float mag_value = *(mag_row_ptr + xc + yc * mag_stride);
+
+ const float w1 = phase_value - std::floor(phase_value);
+
+            // The quantised phase is the histogram index in [0, num_bins - 1]
+            // The modulo wraps hidx back to 0 when it reaches num_bins
+ const size_t hidx = static_cast<size_t>(phase_value) % num_bins;
+
+ // Weighted vote between 2 bins
+ *(output_ptr + hidx) += mag_value * (1.0f - w1);
+ *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
+ }
+ }
+}
+
+void l2_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride,
+ size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, float l2_hyst_threshold)
+{
+ ARM_COMPUTE_UNUSED(l2_hyst_threshold);
+
+ float sum = 0.0f;
+ float32x4_t sum_f32 = vdupq_n_f32(0.0f);
+
+ // Compute L2-Norm
+ for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
+ {
+ const float *const hist_ptr = input_row_ptr + yc * input_stride;
+
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
+ {
+ const float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(hist_ptr + xc + 0),
+ vld1q_f32(hist_ptr + xc + 4),
+ vld1q_f32(hist_ptr + xc + 8),
+ vld1q_f32(hist_ptr + xc + 12)
+ }
+ };
+
+ // Compute input_value^2
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
+
+ vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
+ vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
+ vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
+ vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
+ }
+
+ // Compute left over
+ for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
+ {
+ const float input_value = hist_ptr[xc];
+
+ sum += input_value * input_value;
+
+ output_ptr[xc + yc * num_bins_block_x] = input_value;
+ }
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+ const float32x4_t scale_f32 = vdupq_n_f32(scale);
+
+ int32_t i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ output_ptr[i] *= scale;
+ }
+}
+
+void l2hys_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+ float l2_hyst_threshold)
+{
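+    // L2-Hys normalisation: L2-normalise the block histogram, clip every value at
+    // l2_hyst_threshold, then L2-normalise the clipped values again
+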
+ float sum = 0.0f;
+ float32x4_t sum_f32 = vdupq_n_f32(0.0f);
+
+ // Compute L2-Hys
+ for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
+ {
+ const float *const hist_ptr = input_row_ptr + yc * input_stride;
+
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
+ {
+ const float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(hist_ptr + xc + 0),
+ vld1q_f32(hist_ptr + xc + 4),
+ vld1q_f32(hist_ptr + xc + 8),
+ vld1q_f32(hist_ptr + xc + 12)
+ }
+ };
+
+ // Compute input_value^2
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
+
+ vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
+ vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
+ vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
+ vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
+ }
+
+ // Compute left over
+ for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
+ {
+ const float input_value = hist_ptr[xc];
+
+ sum += input_value * input_value;
+
+ output_ptr[xc + yc * num_bins_block_x] = input_value;
+ }
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+ float32x4_t scale_f32 = vdupq_n_f32(scale);
+ const float32x4_t l2_hyst_threshold_f32 = vdupq_n_f32(l2_hyst_threshold);
+
+ // Reset sum
+ sum_f32 = vdupq_n_f32(0.0f);
+ sum = 0.0f;
+
+ int32_t i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ // Clip input_value if over _threshold_l2hys
+ input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
+ input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
+ input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
+ input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);
+
+ // Compute input_value^2
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
+ sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ float input_value = output_ptr[i] * scale;
+
+ // Clip scaled input_value if over _threshold_L2hys
+ input_value = std::min(input_value, l2_hyst_threshold);
+
+ sum += input_value * input_value;
+
+ output_ptr[i] = input_value;
+ }
+
+    // We use the same constants as OpenCV
+ scale = 1.0f / (std::sqrt(sum) + 1e-3f);
+ scale_f32 = vdupq_n_f32(scale);
+
+ // Rescale
+ i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ // Store result
+ output_ptr[i] *= scale;
+ }
+}
+
+void l1_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
+ float l2_hyst_threshold)
+{
+ ARM_COMPUTE_UNUSED(l2_hyst_threshold);
+
+ float sum = 0.0f;
+ float32x4_t sum_f32 = vdupq_n_f32(0.0f);
+
+ // Compute L1-Norm
+ for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
+ {
+ const float *const hist_ptr = input_row_ptr + yc * input_stride;
+
+ int32_t xc = 0;
+
+ for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
+ {
+ const float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(hist_ptr + xc + 0),
+ vld1q_f32(hist_ptr + xc + 4),
+ vld1q_f32(hist_ptr + xc + 8),
+ vld1q_f32(hist_ptr + xc + 12)
+ }
+ };
+
+            // Accumulate |input_value|
+            sum_f32 = vaddq_f32(sum_f32, vabsq_f32(input_value.val[0]));
+            sum_f32 = vaddq_f32(sum_f32, vabsq_f32(input_value.val[1]));
+            sum_f32 = vaddq_f32(sum_f32, vabsq_f32(input_value.val[2]));
+            sum_f32 = vaddq_f32(sum_f32, vabsq_f32(input_value.val[3]));
+
+ vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
+ vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
+ vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
+ vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
+ }
+
+ for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
+ {
+ const float input_value = hist_ptr[xc];
+
+ sum += std::abs(input_value);
+
+ output_ptr[xc + yc * num_bins_block_x] = input_value;
+ }
+ }
+
+ sum += vgetq_lane_f32(sum_f32, 0);
+ sum += vgetq_lane_f32(sum_f32, 1);
+ sum += vgetq_lane_f32(sum_f32, 2);
+ sum += vgetq_lane_f32(sum_f32, 3);
+
+ const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
+ const float32x4_t scale_f32 = vdupq_n_f32(scale);
+
+ int32_t i = 0;
+
+ for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
+ {
+ float32x4x4_t input_value =
+ {
+ {
+ vld1q_f32(&output_ptr[i + 0]),
+ vld1q_f32(&output_ptr[i + 4]),
+ vld1q_f32(&output_ptr[i + 8]),
+ vld1q_f32(&output_ptr[i + 12])
+ }
+ };
+
+ // Scale input_value
+ input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
+ input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
+ input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
+ input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
+
+ vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
+ vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
+ vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
+ vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
+ }
+
+ for(; i < static_cast<int32_t>(num_bins_block); ++i)
+ {
+ output_ptr[i] *= scale;
+ }
+}
+} // namespace
+
+NEHOGOrientationBinningKernel::NEHOGOrientationBinningKernel()
+ : _func(nullptr), _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_width(0), _cell_height(0), _num_bins(0), _phase_scale(0)
+{
+}
+
+void NEHOGOrientationBinningKernel::configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+ ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+ _input_magnitude = input_magnitude;
+ _input_phase = input_phase;
+ _output = output;
+ _cell_width = hog_info->cell_size().width;
+ _cell_height = hog_info->cell_size().height;
+ _num_bins = hog_info->num_bins();
+ _phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? _num_bins / 360.0f : _num_bins / 180.0f);
+ _phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
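+    // The two factors combine to _num_bins / 255.0f for signed phase, which implies
+    // the U8 phase tensor maps [0, 360) degrees onto [0, 255]; for unsigned phase
+    // the scale stays at _num_bins / 180.0f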
+
+ if(_cell_width < 8)
+ {
+ _func = &cell_width_lt8;
+ }
+ else
+ {
+ _func = &cell_width_ge8;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ const unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = _cell_height;
+ const unsigned int num_elems_written_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEHOGOrientationBinningKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ const size_t mag_stride = _input_magnitude->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_magnitude->info()->format());
+ const size_t phase_stride = _input_phase->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_phase->info()->format());
+
+ Window win_mag(window);
+ win_mag.set(Window::DimX, Window::Dimension(window.x().start() * _cell_width, window.x().start() * _cell_width, _cell_width));
+ win_mag.set(Window::DimY, Window::Dimension(window.y().start() * _cell_height, window.y().start() * _cell_height, _cell_height));
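+    // Only the start and step of these windows matter: the trip count is driven by
+    // "window" (one iteration per cell) while mag/phase advance a full cell per step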
+
+ Window win_phase(win_mag);
+
+ Iterator mag(_input_magnitude, win_mag);
+ Iterator phase(_input_phase, win_phase);
+ Iterator out(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto mag_row_ptr = reinterpret_cast<const int16_t *>(mag.ptr());
+ const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
+ const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
+
+ (*_func)(mag_row_ptr, phase_row_ptr, out_row_ptr, mag_stride, phase_stride, _cell_width, _cell_height, _num_bins, _phase_scale);
+ },
+ mag, phase, out);
+}
+
+NEHOGBlockNormalizationKernel::NEHOGBlockNormalizationKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _num_cells_per_block(), _num_cells_per_block_stride(), _num_bins(0), _l2_hyst_threshold(0.0f)
+{
+}
+
+void NEHOGBlockNormalizationKernel::configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info)
+{
+ ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+ // Number of cells per block
+ const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+ hog_info->block_size().height / hog_info->cell_size().height);
+
+ // Number of cells per block stride
+ const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+ hog_info->block_stride().height / hog_info->cell_size().height);
+
+ _input = input;
+ _output = output;
+ _l2_hyst_threshold = hog_info->l2_hyst_threshold();
+ _num_cells_per_block = num_cells_per_block;
+ _num_cells_per_block_stride = num_cells_per_block_stride;
+ _num_bins = hog_info->num_bins();
+
+ ARM_COMPUTE_ERROR_ON((output->info()->num_channels() != (_num_bins * num_cells_per_block.width * num_cells_per_block.height)));
+
+ switch(hog_info->normalization_type())
+ {
+ case HOGNormType::L2_NORM:
+ _func = &l2_norm;
+ break;
+ case HOGNormType::L2HYS_NORM:
+ _func = &l2hys_norm;
+ break;
+ case HOGNormType::L1_NORM:
+ _func = &l1_norm;
+ break;
+ default:
+            ARM_COMPUTE_ERROR("Normalisation type not supported");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+ const unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = _num_cells_per_block.height;
+ const unsigned int num_elems_written_per_iteration = 1;
+ const unsigned int num_rows_written_per_iteration = _num_cells_per_block.height;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NEHOGBlockNormalizationKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ // Get number of bins per block
+ const size_t num_bins_per_block = _output->info()->num_channels();
+
+ // Number of bins on the same row of the block
+ const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;
+
+ const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
+
+ Window win_in(window);
+ win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+ win_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+    // Normalize blocks
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr());
+ const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
+
+ // Execute normalization function
+ (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
+ },
+ in, out);
+}
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
new file mode 100644
index 0000000000..4af22bca75
--- /dev/null
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+NEHOGDetectorKernel::NEHOGDetectorKernel()
+ : _input(nullptr), _detection_windows(), _hog_descriptor(nullptr), _bias(0.0f), _threshold(0.0f), _idx_class(0), _num_bins_per_descriptor_x(0), _num_blocks_per_descriptor_y(0), _block_stride_width(0),
+ _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _max_num_detection_windows(0), _mutex()
+{
+}
+
+void NEHOGDetectorKernel::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(hog == nullptr);
+ ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+ const Size2D &detection_window_size = hog->info()->detection_window_size();
+ const Size2D &block_size = hog->info()->block_size();
+ const Size2D &block_stride = hog->info()->block_stride();
+
+ _input = input;
+ _detection_windows = detection_windows;
+ _threshold = threshold;
+ _idx_class = idx_class;
+ _hog_descriptor = hog->descriptor();
+ _bias = _hog_descriptor[hog->info()->descriptor_size() - 1];
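+    // Bins per descriptor row: number of blocks along one row of the detection window times the
+    // bins per block (stored as channels of the input tensor)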
+ _num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+ _num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+ _block_stride_width = block_stride.width;
+ _block_stride_height = block_stride.height;
+ _detection_window_width = detection_window_size.width;
+ _detection_window_height = detection_window_size.height;
+ _max_num_detection_windows = detection_windows->max_num_values();
+
+ ARM_COMPUTE_ERROR_ON((_num_bins_per_descriptor_x * _num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
+
+ // Get the number of blocks along the x and y directions of the input tensor
+ const ValidRegion &valid_region = input->info()->valid_region();
+ const size_t num_blocks_x = valid_region.shape[0];
+ const size_t num_blocks_y = valid_region.shape[1];
+
+ // Get the number of blocks along the x and y directions of the detection window
+ const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+ const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+ const size_t window_step_x = detection_window_stride.width / block_stride.width;
+ const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
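+    // Slide the detection window over the block grid; the end of each dimension is rounded down
+    // so that a complete detection window always fits inside the valid region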
+ // Configure kernel window
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = _num_blocks_per_descriptor_y;
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEHOGDetectorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_hog_descriptor == nullptr);
+
+ const size_t in_step_y = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
+
+ Iterator in(_input, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto *in_row_ptr = reinterpret_cast<const float *>(in.ptr());
+
+ // Init score_f32 with 0
+ float32x4_t score_f32 = vdupq_n_f32(0.0f);
+
+ // Init score with bias
+ float score = _bias;
+
+ // Compute Linear SVM
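+        // The descriptor of the current window is multiplied element-wise with the detector
+        // weights and accumulated: score = bias + dot(descriptor, weights). The vector loop
+        // consumes 16 floats per iteration with four multiply-accumulates; the scalar loop
+        // below handles the remaining bins.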
+ for(size_t yb = 0; yb < _num_blocks_per_descriptor_y; ++yb, in_row_ptr += in_step_y)
+ {
+ int32_t xb = 0;
+
+ const int32_t offset_y = yb * _num_bins_per_descriptor_x;
+
+ for(; xb < static_cast<int32_t>(_num_bins_per_descriptor_x) - 16; xb += 16)
+ {
+ // Load descriptor values
+ const float32x4x4_t a_f32 =
+ {
+ {
+ vld1q_f32(&in_row_ptr[xb + 0]),
+ vld1q_f32(&in_row_ptr[xb + 4]),
+ vld1q_f32(&in_row_ptr[xb + 8]),
+ vld1q_f32(&in_row_ptr[xb + 12])
+ }
+ };
+
+ // Load detector values
+ const float32x4x4_t b_f32 =
+ {
+ {
+ vld1q_f32(&_hog_descriptor[xb + 0 + offset_y]),
+ vld1q_f32(&_hog_descriptor[xb + 4 + offset_y]),
+ vld1q_f32(&_hog_descriptor[xb + 8 + offset_y]),
+ vld1q_f32(&_hog_descriptor[xb + 12 + offset_y])
+ }
+ };
+
+ // Multiply accumulate
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[0], b_f32.val[0]);
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[1], b_f32.val[1]);
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[2], b_f32.val[2]);
+ score_f32 = vmlaq_f32(score_f32, a_f32.val[3], b_f32.val[3]);
+ }
+
+ for(; xb < static_cast<int32_t>(_num_bins_per_descriptor_x); ++xb)
+ {
+ const float a = in_row_ptr[xb];
+ const float b = _hog_descriptor[xb + offset_y];
+
+ score += a * b;
+ }
+ }
+
+ score += vgetq_lane_f32(score_f32, 0);
+ score += vgetq_lane_f32(score_f32, 1);
+ score += vgetq_lane_f32(score_f32, 2);
+ score += vgetq_lane_f32(score_f32, 3);
+
+ if(score > _threshold)
+ {
+ if(_detection_windows->num_values() < _max_num_detection_windows)
+ {
+ DetectionWindow win;
+ win.x = (id.x() * _block_stride_width);
+ win.y = (id.y() * _block_stride_height);
+ win.width = _detection_window_width;
+ win.height = _detection_window_height;
+ win.idx_class = _idx_class;
+ win.score = score;
+
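+                // Multiple threads can push detections concurrently; guard the shared array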
+ std::unique_lock<std::mutex> lock(_mutex);
+ _detection_windows->push_back(win);
+ lock.unlock();
+ }
+ }
+ },
+ in);
+}
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
new file mode 100644
index 0000000000..585676bb87
--- /dev/null
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -0,0 +1,1137 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+using namespace arm_compute;
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+
+template class arm_compute::NEHarrisScoreFP16Kernel<3>;
+template class arm_compute::NEHarrisScoreFP16Kernel<5>;
+template class arm_compute::NEHarrisScoreFP16Kernel<7>;
+
+namespace fp16
+{
+inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
+{
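+    // Harris response Mc = det(A) - sensitivity * trace(A)^2 computed per lane; responses not
+    // exceeding strength_thresh are zeroed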
+ static const float16x8_t zero = vdupq_n_f16(0.f);
+
+ // Trace^2
+ float16x8_t trace2 = vaddq_f16(gx2, gy2);
+ trace2 = vmulq_f16(trace2, trace2);
+
+ // Det(A)
+ float16x8_t det = vmulq_f16(gx2, gy2);
+ det = vfmsq_f16(det, gxgy, gxgy);
+
+ // Det(A) - sensitivity * trace^2
+ const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
+
+ // mc > strength_thresh
+ const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
+
+ return vbslq_f16(mask, mc, zero);
+}
+
+template <size_t block_size>
+inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
+ float norm_factor)
+{
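+    // Accumulate Gx^2, Gy^2 and Gx*Gy over up to block_size columns of one row; each vextq_f16
+    // shifts the 16-element low||high window by one column to reach the next horizontal neighbour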
+ const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
+
+ // Normalize
+ low_gx = vmulq_f16(low_gx, norm_factor_fp16);
+ low_gy = vmulq_f16(low_gy, norm_factor_fp16);
+ high_gx = vmulq_f16(high_gx, norm_factor_fp16);
+ high_gy = vmulq_f16(high_gy, norm_factor_fp16);
+
+ float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
+ float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 1);
+ gy = vextq_f16(low_gy, high_gy, 1);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 2);
+ gy = vextq_f16(low_gy, high_gy, 2);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ if(block_size > 3)
+ {
+ gx = vextq_f16(low_gx, high_gx, 3);
+ gy = vextq_f16(low_gy, high_gy, 3);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 4);
+ gy = vextq_f16(low_gy, high_gy, 4);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+ }
+
+ if(block_size == 7)
+ {
+ gx = vextq_f16(low_gx, high_gx, 5);
+ gy = vextq_f16(low_gy, high_gy, 5);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+
+ gx = vextq_f16(low_gx, high_gx, 6);
+ gy = vextq_f16(low_gy, high_gy, 6);
+
+ gx2 = vfmaq_f16(gx2, gx, gx);
+ gy2 = vfmaq_f16(gy2, gy, gy);
+ gxgy = vfmaq_f16(gxgy, gx, gy);
+ }
+}
+
+template <size_t block_size>
+inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
+ float strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
+ auto gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(out_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float16x8_t gx2 = vdupq_n_f16(0.0f);
+ float16x8_t gy2 = vdupq_n_f16(0.0f);
+ float16x8_t gxgy = vdupq_n_f16(0.0f);
+
+ for(size_t i = 0; i < block_size; ++i)
+ {
+ const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
+ const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
+ const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
+ const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
+ harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += in_stride;
+ gy_ptr_0 += in_stride;
+ gx_ptr_1 += in_stride;
+ gy_ptr_1 += in_stride;
+ }
+
+ // Calculate harris score
+ const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
+ vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
+}
+
+template <size_t block_size>
+inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
+ float strength_thresh)
+{
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(out_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float16x8_t gx2 = zero;
+ float16x8_t gy2 = zero;
+ float16x8_t gxgy = zero;
+
+ for(size_t i = 0; i < block_size; ++i)
+ {
+ const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
+ const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
+ vget_low_f16(zero));
+ const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
+ const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
+ vget_low_f16(zero));
+ harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += in_stride;
+ gy_ptr_0 += in_stride;
+ gx_ptr_1 += in_stride;
+ gy_ptr_1 += in_stride;
+ gx_ptr_2 += in_stride;
+ gy_ptr_2 += in_stride;
+ }
+
+ // Calculate harris score
+ const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
+ vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
+}
+
+template <>
+inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
+ float strength_thresh)
+{
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
+ const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
+ const auto output = static_cast<float *__restrict>(out_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float16x8_t gx2 = zero;
+ float16x8_t gy2 = zero;
+ float16x8_t gxgy = zero;
+
+ for(size_t i = 0; i < 7; ++i)
+ {
+ const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
+ const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
+ const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
+ const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
+ vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
+ harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
+
+        // Update gx and gy pointers; gx_ptr_3/gy_ptr_3 are reloaded every iteration and must
+        // advance as well
+        gx_ptr_0 += in_stride;
+        gy_ptr_0 += in_stride;
+        gx_ptr_1 += in_stride;
+        gy_ptr_1 += in_stride;
+        gx_ptr_2 += in_stride;
+        gy_ptr_2 += in_stride;
+        gx_ptr_3 += in_stride;
+        gy_ptr_3 += in_stride;
+ }
+
+ // Calculate harris score
+ const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
+ vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
+}
+
+} // namespace fp16
+
+template <int32_t block_size>
+BorderSize NEHarrisScoreFP16Kernel<block_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <int32_t block_size>
+NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
+ : INEHarrisScoreKernel(), _func(nullptr)
+{
+}
+
+template <int32_t block_size>
+void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
+ },
+ input1, input2, output);
+}
+
+template <int32_t block_size>
+void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _sensitivity = sensitivity;
+ _strength_thresh = strength_thresh;
+ _norm_factor = norm_factor;
+ _border_size = BorderSize(block_size / 2);
+
+ if(input1->info()->data_type() == DataType::S16)
+ {
+ _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
+ }
+ else
+ {
+ _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
+ }
+
+ ARM_COMPUTE_ERROR_ON(nullptr == _func);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = block_size;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+template class arm_compute::NEHarrisScoreKernel<3>;
+template class arm_compute::NEHarrisScoreKernel<5>;
+template class arm_compute::NEHarrisScoreKernel<7>;
+template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
+template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
+template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
+
+namespace
+{
+inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
+{
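+    // Harris corner response per lane: Mc = det(A) - sensitivity * trace(A)^2, where A is the
+    // 2x2 structure tensor [[Gx^2, GxGy], [GxGy, Gy^2]]; responses not exceeding strength_thresh
+    // are zeroed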
+ // Trace^2
+ float32x4_t trace2 = vaddq_f32(gx2, gy2);
+ trace2 = vmulq_f32(trace2, trace2);
+
+ // Det(A)
+ float32x4_t det = vmulq_f32(gx2, gy2);
+ det = vmlsq_f32(det, gxgy, gxgy);
+
+ // Det(A) - sensitivity * trace^2
+ const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
+
+ // mc > strength_thresh
+ const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
+
+ return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
+}
+
+inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
+ float32x4_t norm_factor)
+{
+ // Normalize
+ low_gx = vmulq_f32(low_gx, norm_factor);
+ low_gy = vmulq_f32(low_gy, norm_factor);
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
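+    // Left, middle and right neighbours of each lane, extracted across the low||high register pair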
+ const float32x4_t l_gx = low_gx;
+ const float32x4_t l_gy = low_gy;
+ const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
+ const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
+ const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
+ const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Gx*Gx
+ gx2 = vmlaq_f32(gx2, l_gx, l_gx);
+ gx2 = vmlaq_f32(gx2, m_gx, m_gx);
+ gx2 = vmlaq_f32(gx2, r_gx, r_gx);
+
+ // Gy*Gy
+ gy2 = vmlaq_f32(gy2, l_gy, l_gy);
+ gy2 = vmlaq_f32(gy2, m_gy, m_gy);
+ gy2 = vmlaq_f32(gy2, r_gy, r_gy);
+
+ // Gx*Gy
+ gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
+ gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
+ gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
+}
+
+inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
+ float32x4_t norm_factor)
+{
+ // Normalize
+ low_gx = vmulq_f32(low_gx, norm_factor);
+ low_gy = vmulq_f32(low_gy, norm_factor);
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
+ // L2 values
+ float32x4_t gx = low_gx;
+ float32x4_t gy = low_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // L1 values
+ gx = vextq_f32(low_gx, high_gx, 1);
+ gy = vextq_f32(low_gy, high_gy, 1);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // M values
+ gx = vextq_f32(low_gx, high_gx, 2);
+ gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R1 values
+ gx = vextq_f32(low_gx, high_gx, 3);
+ gy = vextq_f32(low_gy, high_gy, 3);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R2 values
+ gx = high_gx;
+ gy = high_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+}
+
+inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
+ float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
+{
+ // Normalize
+ low_gx = vmulq_f32(low_gx, norm_factor);
+ low_gy = vmulq_f32(low_gy, norm_factor);
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
+ // L3 values
+ float32x4_t gx = low_gx;
+ float32x4_t gy = low_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // L2 values
+ gx = vextq_f32(low_gx, high_gx, 1);
+ gy = vextq_f32(low_gy, high_gy, 1);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // L1 values
+ gx = vextq_f32(low_gx, high_gx, 2);
+ gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // M values
+ gx = vextq_f32(low_gx, high_gx, 3);
+ gy = vextq_f32(low_gy, high_gy, 3);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R1 values
+ gx = high_gx;
+ gy = high_gy;
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+    // Slide the window: high_gx/high_gy become the new low values and high_gx1/high_gy1 provide
+    // the data needed for the R2 and R3 columns
+ low_gx = high_gx;
+ low_gy = high_gy;
+ high_gx = high_gx1;
+ high_gy = high_gy1;
+
+ // Normalize
+ high_gx = vmulq_f32(high_gx, norm_factor);
+ high_gy = vmulq_f32(high_gy, norm_factor);
+
+ // R2 values
+ gx = vextq_f32(low_gx, high_gx, 1);
+ gy = vextq_f32(low_gy, high_gy, 1);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+
+ // R3 values
+ gx = vextq_f32(low_gx, high_gx, 2);
+ gy = vextq_f32(low_gy, high_gy, 2);
+
+ // Accumulate
+ gx2 = vmlaq_f32(gx2, gx, gx);
+ gy2 = vmlaq_f32(gy2, gy, gy);
+ gxgy = vmlaq_f32(gxgy, gx, gy);
+}
+
+inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
+ const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+
+ // Row0
+ int16x8x2_t tmp_gx =
+ {
+ {
+ vld1q_s16(gx_ptr_0 - input_stride),
+ vld1q_s16(gx_ptr_1 - input_stride)
+ }
+ };
+ int16x8x2_t tmp_gy =
+ {
+ {
+ vld1q_s16(gy_ptr_0 - input_stride),
+ vld1q_s16(gy_ptr_1 - input_stride)
+ }
+ };
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row1
+ tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
+ tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
+ tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
+ tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row2
+ tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
+ tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
+ tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
+ tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+
+ // Row0
+ float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
+ float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
+ float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
+ float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row1
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Row2
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
+ low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
+ high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
+ high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
+ harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
+ auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
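+    // Point to the top-left element of the 5x5 neighbourhood centred on the current pixel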
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 5; ++i)
+ {
+ const int16x8x2_t tmp_gx =
+ {
+ {
+ vld1q_s16(gx_ptr_0),
+ vld1q_s16(gx_ptr_1)
+ }
+ };
+ const int16x8x2_t tmp_gy =
+ {
+ {
+ vld1q_s16(gy_ptr_0),
+ vld1q_s16(gy_ptr_1)
+ }
+ };
+
+ float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
+ float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
+ float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
+ float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
+ low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
+ high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
+ high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
+ const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4x2_t gx2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gy2 =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4x2_t gxgy =
+ {
+ {
+ vdupq_n_f32(0.0f),
+ vdupq_n_f32(0.0f)
+ }
+ };
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 5; ++i)
+ {
+ const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
+ const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
+ const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
+
+ const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
+ const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
+ harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ gx_ptr_2 += input_stride;
+ gy_ptr_2 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4x2_t mc =
+ {
+ {
+ harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
+ harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
+ }
+ };
+
+ // Store score
+ vst1q_f32(output + 0, mc.val[0]);
+ vst1q_f32(output + 4, mc.val[1]);
+}
+
+inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
+ auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
+ const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
+ const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4_t gx2 = vdupq_n_f32(0.0f);
+ float32x4_t gy2 = vdupq_n_f32(0.0f);
+ float32x4_t gxgy = vdupq_n_f32(0.0f);
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 7; ++i)
+ {
+ const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
+ const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
+ const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
+ const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
+
+ float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
+ float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
+ float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
+ float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
+ float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
+ float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
+ harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output, mc);
+}
+
+inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
+ float in_norm_factor, float in_sensitivity, float in_strength_thresh)
+{
+ auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
+ auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
+ const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
+ const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
+ const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
+ const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Gx^2, Gy^2 and Gx*Gy
+ float32x4_t gx2 = vdupq_n_f32(0.0f);
+ float32x4_t gy2 = vdupq_n_f32(0.0f);
+ float32x4_t gxgy = vdupq_n_f32(0.0f);
+ float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
+ float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
+ float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
+
+ for(int i = 0; i < 7; ++i)
+ {
+ const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
+ const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
+ const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
+ const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
+ const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
+ const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
+ harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
+
+ // Update gx and gy pointer
+ gx_ptr_0 += input_stride;
+ gy_ptr_0 += input_stride;
+ gx_ptr_1 += input_stride;
+ gy_ptr_1 += input_stride;
+ gx_ptr_2 += input_stride;
+ gy_ptr_2 += input_stride;
+ }
+
+ // Calculate harris score
+ const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
+
+ // Store score
+ vst1q_f32(output, mc);
+}
+
+} // namespace
+
+INEHarrisScoreKernel::INEHarrisScoreKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
+{
+}
+
+template <int32_t block_size>
+NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
+ : INEHarrisScoreKernel(), _func(nullptr)
+{
+}
+
+template <int32_t block_size>
+void NEHarrisScoreKernel<block_size>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
+ },
+ input1, input2, output);
+}
+
+template <int32_t block_size>
+BorderSize NEHarrisScoreKernel<block_size>::border_size() const
+{
+ return _border_size;
+}
+
+template <int32_t block_size>
+void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _sensitivity = sensitivity;
+ _strength_thresh = strength_thresh;
+ _norm_factor = norm_factor;
+ _border_size = BorderSize(block_size / 2);
+
+ if(input1->info()->data_type() == DataType::S16)
+ {
+ switch(block_size)
+ {
+ case 3:
+ _func = &harris_score3x3_S16_S16_FLOAT;
+ break;
+ case 5:
+ _func = &harris_score5x5_S16_S16_FLOAT;
+ break;
+ case 7:
+ _func = &harris_score7x7_S16_S16_FLOAT;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid block size");
+ break;
+ }
+ }
+ else
+ {
+ switch(block_size)
+ {
+ case 3:
+ _func = &harris_score3x3_S32_S32_FLOAT;
+ break;
+ case 5:
+ _func = &harris_score5x5_S32_S32_FLOAT;
+ break;
+ case 7:
+ _func = &harris_score7x7_S32_S32_FLOAT;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid block size");
+ break;
+ }
+ }
+
+ ARM_COMPUTE_ERROR_ON(nullptr == _func);
+
+ constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
+ constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
+ constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
+ constexpr unsigned int num_rows_read_per_iteration = block_size;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
new file mode 100644
index 0000000000..9e967ec4f5
--- /dev/null
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IDistribution1D.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <array>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
+{
+ std::lock_guard<std::mutex> lock(_hist_mtx);
+
+ const unsigned int v_end = (bins / 4) * 4;
+
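+    // Fold the thread-local histogram into the global one, four bins per vector add;
+    // the scalar loop below handles any remaining bins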
+ for(unsigned int b = 0; b < v_end; b += 4)
+ {
+ const uint32x4_t tmp_global = vld1q_u32(global_hist + b);
+ const uint32x4_t tmp_local = vld1q_u32(local_hist + b);
+ vst1q_u32(global_hist + b, vaddq_u32(tmp_global, tmp_local));
+ }
+
+ for(unsigned int b = v_end; b < bins; ++b)
+ {
+ global_hist[b] += local_hist[b];
+ }
+}
+
+NEHistogramKernel::NEHistogramKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _local_hist(nullptr), _window_lut(nullptr), _hist_mtx()
+{
+}
+
+void NEHistogramKernel::histogram_U8(Window win)
+{
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+
+ const size_t bins = _output->num_bins();
+ const int32_t offset = _output->offset();
+ const uint32_t offrange = offset + _output->range();
+ const uint32_t *const w_lut = _window_lut;
+ uint32_t *const local_hist = _local_hist + win.thread_id() * bins;
+
+ // Clear local_histogram
+ std::fill_n(local_hist, bins, 0);
+
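+    // Increment the bin a pixel maps to, provided the pixel lies inside [offset, offset + range)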
+ auto update_local_hist = [&](uint8_t p)
+ {
+ if(offset <= p && p < offrange)
+ {
+ ++local_hist[w_lut[p]];
+ }
+ };
+
+ const unsigned int x_start = win.x().start();
+ const unsigned int x_end = win.x().end();
+
+    // Handle the X dimension manually so it can be split into two loops:
+    // the first uses vector operations, the second processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+
+ // Calculate local histogram
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ unsigned int x = x_start;
+
+ // Vector loop
+        for(; x + 8 <= x_end; x += 8)
+ {
+ const uint8x8_t pixels = vld1_u8(input.ptr() + x);
+
+ update_local_hist(vget_lane_u8(pixels, 0));
+ update_local_hist(vget_lane_u8(pixels, 1));
+ update_local_hist(vget_lane_u8(pixels, 2));
+ update_local_hist(vget_lane_u8(pixels, 3));
+ update_local_hist(vget_lane_u8(pixels, 4));
+ update_local_hist(vget_lane_u8(pixels, 5));
+ update_local_hist(vget_lane_u8(pixels, 6));
+ update_local_hist(vget_lane_u8(pixels, 7));
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ update_local_hist(input.ptr()[x]);
+ }
+ },
+ input);
+
+ // Merge histograms
+ merge_histogram(_output->buffer(), local_hist, bins);
+}
+
+void NEHistogramKernel::histogram_fixed_U8(Window win)
+{
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+
+ std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
+
+ const unsigned int x_start = win.x().start();
+ const unsigned int x_end = win.x().end();
+
+    // Handle the X dimension manually so it can be split into two loops:
+    // the first uses vector operations, the second processes the leftover pixels
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win);
+
+ // Calculate local histogram
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ unsigned int x = x_start;
+
+ // Vector loop
+        for(; x + 8 <= x_end; x += 8)
+ {
+ const uint8x8_t pixels = vld1_u8(input.ptr() + x);
+
+ ++local_hist[vget_lane_u8(pixels, 0)];
+ ++local_hist[vget_lane_u8(pixels, 1)];
+ ++local_hist[vget_lane_u8(pixels, 2)];
+ ++local_hist[vget_lane_u8(pixels, 3)];
+ ++local_hist[vget_lane_u8(pixels, 4)];
+ ++local_hist[vget_lane_u8(pixels, 5)];
+ ++local_hist[vget_lane_u8(pixels, 6)];
+ ++local_hist[vget_lane_u8(pixels, 7)];
+ }
+
+ // Process leftover pixels
+ for(; x < x_end; ++x)
+ {
+ ++local_hist[input.ptr()[x]];
+ }
+ },
+ input);
+
+ // Merge histograms
+ merge_histogram(_output->buffer(), local_hist.data(), _max_range_size);
+}
+
+void NEHistogramKernel::calculate_window_lut() const
+{
+ const int32_t offset = _output->offset();
+ const size_t bins = _output->num_bins();
+ const uint32_t range = _output->range();
+
+ std::fill_n(_window_lut, offset, 0);
+
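+    // Map every pixel value p in [offset, _max_range_size) to its bin: ((p - offset) * bins) / range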
+ for(unsigned int p = offset; p < _max_range_size; ++p)
+ {
+ _window_lut[p] = ((p - offset) * bins) / range;
+ }
+}
+
+void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == local_hist);
+ ARM_COMPUTE_ERROR_ON(nullptr == window_lut);
+
+ _input = input;
+ _output = output;
+ _local_hist = local_hist;
+ _window_lut = window_lut;
+
+    // Check offset
+    ARM_COMPUTE_ERROR_ON_MSG(0 > _output->offset() || _output->offset() > static_cast<int32_t>(_max_range_size), "Offset is outside the image value range.");
+
+    // Check range
+    ARM_COMPUTE_ERROR_ON_MSG(static_cast<int32_t>(_output->range()) > static_cast<int32_t>(_max_range_size), "Range is larger than the image value range.");
+
+ // Calculate LUT
+ calculate_window_lut();
+
+ // Set appropriate function
+ _func = &NEHistogramKernel::histogram_U8;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ _input = input;
+ _output = output;
+
+ // Set appropriate function
+ _func = &NEHistogramKernel::histogram_fixed_U8;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEHistogramKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
new file mode 100644
index 0000000000..c7c23d5d06
--- /dev/null
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T, bool has_pads>
+inline void linearize_volume(const uint8_t *const in_ptr,
+ T *out_ptr,
+ bool has_bias,
+ int top_left_x,
+ int top_left_y,
+ int kernel_size,
+ int kernel_depth,
+ int input_w,
+ int input_h,
+ int input_stride_x,
+ int input_stride_y,
+ int input_stride_z,
+ int fixed_point_position)
+{
+ const int kernel_size2 = kernel_size * kernel_size;
+ const int x_e = top_left_x + kernel_size;
+ const int y_e = top_left_y + kernel_size;
+
+ // Linearize volume
+ int d = 0;
+    // This loop linearizes the volume three slices at a time. This allows:
+    // 1) reducing the number of iterations of the outer loop "d"
+    // 2) an optimized im2col for the first convolution layer, which usually has 3 IFMs
+ for(; d <= (kernel_depth - 3); d += 3)
+ {
+ for(int y = top_left_y; y < y_e; ++y)
+ {
+ if((y < 0 || y >= input_h) && has_pads)
+ {
+                // All the values will be zero
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ *(out_ptr + 0 * kernel_size2) = 0;
+ *(out_ptr + 1 * kernel_size2) = 0;
+ *(out_ptr + 2 * kernel_size2) = 0;
+ }
+ }
+ else
+ {
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ if((x < 0 || x >= input_w) && has_pads)
+ {
+ *(out_ptr + 0 * kernel_size2) = 0;
+ *(out_ptr + 1 * kernel_size2) = 0;
+ *(out_ptr + 2 * kernel_size2) = 0;
+ }
+ else
+ {
+ *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
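+        // The inner loops advanced out_ptr through slice d only; skip past the two
+        // extra slices that were written via the +1 and +2 * kernel_size2 offsets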
+ out_ptr += 2 * kernel_size2;
+ }
+
+ // Left over
+ for(; d < kernel_depth; d++)
+ {
+ for(int y = top_left_y; y < y_e; ++y)
+ {
+ if((y < 0 || y >= input_h) && has_pads)
+ {
+                // All the values will be zero
+ memset(out_ptr, 0, kernel_size * sizeof(T));
+ out_ptr += kernel_size;
+ }
+ else
+ {
+ for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+ {
+ if((x < 0 || x >= input_w) && has_pads)
+ {
+ *out_ptr = 0;
+ }
+ else
+ {
+ *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ }
+ }
+ }
+ }
+ }
+
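+    // The appended 1 multiplies the bias row of the reshaped weights in the
+    // following GEMM, folding the bias addition into the matrix multiplication.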
+ // Append 1 if the convolution layer has biases
+ if(has_bias)
+ {
+ if(std::is_same<T, arm_compute::qint8_t>::value)
+ {
+ *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+ }
+ else
+ {
+ *out_ptr = static_cast<T>(1);
+ }
+ }
+}
+} // namespace
+
+template <typename T, bool has_pads>
+void NEIm2ColKernel::run_generic(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const int kernel_depth = _input->info()->dimension(2);
+ const int input_w = _input->info()->dimension(0);
+ const int input_h = _input->info()->dimension(1);
+ const int input_stride_x = _input->info()->strides_in_bytes().x();
+ const int input_stride_y = _input->info()->strides_in_bytes().y();
+ const int input_stride_z = _input->info()->strides_in_bytes().z();
+
+ int pad_x = 0;
+ int pad_y = 0;
+ int stride_x = 0;
+ int stride_y = 0;
+ std::tie(pad_x, pad_y) = _conv_info.pad();
+ std::tie(stride_x, stride_y) = _conv_info.stride();
+
+ // Setup input window
+ const int start_x = -pad_x;
+ const int start_y = -pad_y;
+
+ Window window_in(window);
+ // The first three dimensions of the input are increased by the inner loops
+ window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ // Setup output window
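+    // Each output row holds one linearized receptive field: the row index is
+    // y * convolved_width + x for output position (x, y), hence DimY below
+    // steps by _convolved_dims.first.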
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->strides_in_bytes().y() / _output->info()->element_size()));
+ window_out.set(Window::DimY, Window::Dimension(window.y().start() * _convolved_dims.first, window.y().end() * _convolved_dims.first, _convolved_dims.first));
+ window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Create iterators
+ Iterator in(_input, window_in);
+ Iterator out(_output, window_out);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int top_left_x = id.x() * stride_x + start_x;
+ const int top_left_y = id.y() * stride_y + start_y;
+
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ auto output_ptr = reinterpret_cast<T *>(out.ptr());
+
+ // Linearize volume
+ linearize_volume<T, has_pads>(input_ptr,
+ output_ptr,
+ _has_bias,
+ top_left_x,
+ top_left_y,
+ static_cast<int>(_kernel_size),
+ kernel_depth,
+ input_w,
+ input_h,
+ input_stride_x,
+ input_stride_y,
+ input_stride_z,
+ _input->info()->fixed_point_position());
+ },
+ in, out);
+}
+
+template <typename T>
+void NEIm2ColKernel::run_reduced(const Window &window)
+{
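+    // In the reduced case (no padding, unit stride, and an output width equal to
+    // the whole input volume) im2col degenerates into flattening the input, so
+    // each input row can be copied straight into the output buffer.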
+ const size_t in_width = _input->info()->dimension(0);
+ const size_t in_height = _input->info()->dimension(1);
+ const size_t out_step_x = in_width * _input->info()->element_size();
+ const size_t out_step_y = out_step_x * in_height;
+ const size_t out_width = _output->info()->dimension(0);
+
+ Window in_window(window);
+ in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info());
+ out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
+
+ Window in_slice = in_window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_1D();
+
+ do
+ {
+ Iterator in(_input, in_slice);
+ Iterator out(_output, out_slice);
+
+ uint8_t *out_ptr = out.ptr();
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ memcpy(out_ptr + id.y() * out_step_x + id.z() * out_step_y, in.ptr(), out_step_x);
+ },
+ in);
+
+ // Add bias
+ if(_has_bias)
+ {
+ if(std::is_same<T, arm_compute::qint8_t>::value)
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
+ }
+ }
+ }
+ while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
+
+NEIm2ColKernel::NEIm2ColKernel()
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _has_bias(false)
+{
+}
+
+void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+ _convolved_dims = convolved_dims;
+ _conv_info = conv_info;
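+    // The matrix width is kernel_depth * kernel_size^2 (+1 if a bias is
+    // appended), so the kernel size can be recovered from the output width.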
+ _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
+ _has_bias = has_bias;
+
+    unsigned int pad_x = 0, pad_y = 0, stride_x = 0, stride_y = 0;
+ std::tie(pad_x, pad_y) = conv_info.pad();
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
+ bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+ && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1))
+ && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+
+ Window window = calculate_max_window(*input->info(), Steps());
+
+ if(run_img2col_reduced)
+ {
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = &NEIm2ColKernel::run_reduced<float>;
+ break;
+ case DataType::QS8:
+ _func = &NEIm2ColKernel::run_reduced<qint8_t>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+ break;
+ case DataType::QS8:
+ _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1));
+ window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1));
+ window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ }
+
+ // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ IKernel::configure(window);
+}
+
+void NEIm2ColKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
new file mode 100644
index 0000000000..3b09a1bdbb
--- /dev/null
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+void NEIntegralImageKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+
+ _input = input;
+ _output = output;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    // The kernel effectively reads 17 values starting at offset -1, since it
+    // loads 16 values starting at -1 and another 16 starting at 0
+ AccessWindowRectangle output_read_access(output->info(), -1, -1, num_elems_processed_per_iteration + 1, 1);
+ AccessWindowHorizontal output_write_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_read_access, output_write_access);
+
+ output_write_access.set_valid_region(win, input->info()->valid_region());
+
+ IKernel::configure(win);
+}
+
+BorderSize NEIntegralImageKernel::border_size() const
+{
+ return BorderSize(1, 0, 0, 1);
+}
+
+bool NEIntegralImageKernel::is_parallelisable() const
+{
+ return false;
+}
+
+void NEIntegralImageKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const auto output_top_left = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(-1, -1)));
+ const auto output_top_mid = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(0, -1)));
+
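+    // For each block of 16 pixels the vector part computes
+    // in(x, y) + I(x, y - 1) - I(x - 1, y - 1); the scalar prefix sum below then
+    // adds I(x - 1, y), completing the integral image recurrence
+    // I(x, y) = in(x, y) + I(x - 1, y) + I(x, y - 1) - I(x - 1, y - 1).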
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t input_pixels = vld1q_u8(input.ptr());
+
+ const uint16x8x2_t tmp =
+ {
+ {
+ vmovl_u8(vget_low_u8(input_pixels)),
+ vmovl_u8(vget_high_u8(input_pixels))
+ }
+ };
+
+ uint32x4x4_t pixels =
+ {
+ {
+ vmovl_u16(vget_low_u16(tmp.val[0])),
+ vmovl_u16(vget_high_u16(tmp.val[0])),
+ vmovl_u16(vget_low_u16(tmp.val[1])),
+ vmovl_u16(vget_high_u16(tmp.val[1]))
+ }
+ };
+
+        // Divide the byte offset by four as the pointer is now uint32_t instead of uint8_t
+ const size_t off = output.offset() / 4;
+
+ // Add top mid pixel values
+ const uint32_t *const top_mid_ptr = output_top_mid + off;
+
+ pixels.val[0] = vaddq_u32(vld1q_u32(top_mid_ptr), pixels.val[0]);
+ pixels.val[1] = vaddq_u32(vld1q_u32(top_mid_ptr + 4), pixels.val[1]);
+ pixels.val[2] = vaddq_u32(vld1q_u32(top_mid_ptr + 8), pixels.val[2]);
+ pixels.val[3] = vaddq_u32(vld1q_u32(top_mid_ptr + 12), pixels.val[3]);
+
+ // Subtract top left diagonal values
+ const auto outptr = reinterpret_cast<uint32_t *>(output.ptr());
+ const uint32_t *const top_left_ptr = output_top_left + off;
+
+ pixels.val[0] = vsubq_u32(pixels.val[0], vld1q_u32(top_left_ptr));
+ vst1q_u32(outptr, pixels.val[0]);
+
+ pixels.val[1] = vsubq_u32(pixels.val[1], vld1q_u32(top_left_ptr + 4));
+ vst1q_u32(outptr + 4, pixels.val[1]);
+
+ pixels.val[2] = vsubq_u32(pixels.val[2], vld1q_u32(top_left_ptr + 8));
+ vst1q_u32(outptr + 8, pixels.val[2]);
+
+ pixels.val[3] = vsubq_u32(pixels.val[3], vld1q_u32(top_left_ptr + 12));
+ vst1q_u32(outptr + 12, pixels.val[3]);
+
+ // Perform prefix summation
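+        // outptr[-1] holds the integral value of the pixel to the left (the border
+        // column for the first block), so the sum carries across blocks.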
+ for(auto i = 0; i < 16; ++i)
+ {
+ outptr[i] += outptr[i - 1];
+ }
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
new file mode 100644
index 0000000000..3d2bfb204e
--- /dev/null
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cmath>
+
+using namespace arm_compute;
+
+/** Constants used for Lucas-Kanade Algorithm */
+constexpr int W_BITS = 14;
+constexpr float D0 = 1 << W_BITS;
+constexpr float DETERMINANT_THRESHOLD = 1.0e-07f; // Threshold for the determinant. Used as a lost-tracking criterion
+constexpr float EIGENVALUE_THRESHOLD = 1.0e-04f; // Threshold for the minimum eigenvalue. Used as a lost-tracking criterion
+constexpr float FLT_SCALE = 1.0f / (1 << 20);
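+// W_BITS is the number of fixed-point bits used for the bilinear interpolation
+// weights (D0 = 2^W_BITS); FLT_SCALE converts the fixed-point accumulators
+// back to floating point.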
+
+namespace
+{
+enum class BilinearInterpolation
+{
+ BILINEAR_OLD_NEW,
+ BILINEAR_SCHARR
+};
+
+template <typename T>
+constexpr int INT_ROUND(T x, int n)
+{
+ return (x + (1 << (n - 1))) >> n;
+}
+
+template <typename T>
+inline int get_pixel(const ITensor *tensor, int xi, int yi, int iw00, int iw01, int iw10, int iw11, int scale)
+{
+ const auto px00 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi, yi)));
+ const auto px01 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi + 1, yi)));
+ const auto px10 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi, yi + 1)));
+ const auto px11 = *reinterpret_cast<const T *>(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi + 1, yi + 1)));
+
+ return INT_ROUND(px00 * iw00 + px01 * iw01 + px10 * iw10 + px11 * iw11, scale);
+}
+
+inline int32x4_t compute_bilinear_interpolation(int16x8_t top_row, int16x8_t bottom_row, int16x4_t w00, int16x4_t w01, int16x4_t w10, int16x4_t w11, int32x4_t shift)
+{
+ // Get the left column of upper row
+ const int16x4_t px00 = vget_low_s16(top_row);
+
+ // Get the right column of upper row
+ const int16x4_t px01 = vext_s16(px00, vget_high_s16(top_row), 1);
+
+ // Get the left column of lower row
+ const int16x4_t px10 = vget_low_s16(bottom_row);
+
+    // Get the right column of the lower row
+ const int16x4_t px11 = vext_s16(px10, vget_high_s16(bottom_row), 1);
+
+ // Apply the bilinear filter
+ return vqrshlq_s32(vmull_s16(px00, w00) + vmull_s16(px01, w01) + vmull_s16(px10, w10) + vmull_s16(px11, w11), shift);
+}
+} // namespace
+
+void NELKTrackerKernel::init_keypoints(int start, int end)
+{
+ if(_level == _num_levels - 1)
+ {
+ const float level_scale = pow(_pyramid_scale, _level);
+
+ for(int i = start; i < end; ++i)
+ {
+ _old_points_internal->at(i).x = _old_points->at(i).x * level_scale;
+ _old_points_internal->at(i).y = _old_points->at(i).y * level_scale;
+ _old_points_internal->at(i).tracking_status = true;
+
+ NELKInternalKeypoint keypoint_to_track;
+
+ if(_use_initial_estimate)
+ {
+ keypoint_to_track.x = _new_points_estimates->at(i).x * level_scale;
+ keypoint_to_track.y = _new_points_estimates->at(i).y * level_scale;
+ keypoint_to_track.tracking_status = (_new_points_estimates->at(i).tracking_status == 1);
+ }
+ else
+ {
+ keypoint_to_track.x = _old_points_internal->at(i).x;
+ keypoint_to_track.y = _old_points_internal->at(i).y;
+ keypoint_to_track.tracking_status = true;
+ }
+
+ _new_points_internal->at(i) = keypoint_to_track;
+ }
+ }
+ else
+ {
+ for(int i = start; i < end; ++i)
+ {
+ _old_points_internal->at(i).x /= _pyramid_scale;
+ _old_points_internal->at(i).y /= _pyramid_scale;
+ _new_points_internal->at(i).x /= _pyramid_scale;
+ _new_points_internal->at(i).y /= _pyramid_scale;
+ }
+ }
+}
+
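+// Computes the Lucas-Kanade spatial gradient matrix over the tracking window
+//     A = | sum(Ix * Ix)  sum(Ix * Iy) |
+//         | sum(Ix * Iy)  sum(Iy * Iy) |
+// and caches the interpolated Ix/Iy values for the mismatch-vector stage.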
+std::tuple<int, int, int> NELKTrackerKernel::compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy)
+{
+ int iA11 = 0;
+ int iA12 = 0;
+ int iA22 = 0;
+
+ int32x4_t nA11 = vdupq_n_s32(0);
+ int32x4_t nA12 = vdupq_n_s32(0);
+ int32x4_t nA22 = vdupq_n_s32(0);
+
+ float keypoint_int_x = 0;
+ float keypoint_int_y = 0;
+
+ const float wx = std::modf(keypoint.x, &keypoint_int_x);
+ const float wy = std::modf(keypoint.y, &keypoint_int_y);
+
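+    // Bilinear interpolation weights in Q(W_BITS) fixed point; the four weights
+    // sum exactly to D0 = 2^W_BITS, with iw11 computed as the remainder to avoid
+    // rounding drift.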
+ const int iw00 = roundf((1.0f - wx) * (1.0f - wy) * D0);
+ const int iw01 = roundf(wx * (1.0f - wy) * D0);
+ const int iw10 = roundf((1.0f - wx) * wy * D0);
+ const int iw11 = D0 - iw00 - iw01 - iw10;
+
+ const int16x4_t nw00 = vdup_n_s16(iw00);
+ const int16x4_t nw01 = vdup_n_s16(iw01);
+ const int16x4_t nw10 = vdup_n_s16(iw10);
+ const int16x4_t nw11 = vdup_n_s16(iw11);
+
+    // Convert the row stride from bytes (uint8_t) to int16_t elements
+ const size_t row_stride = _old_scharr_gx->info()->strides_in_bytes()[1] / 2;
+ const Coordinates top_left_window_corner(static_cast<int>(keypoint_int_x) - _window_dimension / 2, static_cast<int>(keypoint_int_y) - _window_dimension / 2);
+ auto idx = reinterpret_cast<const int16_t *>(_old_scharr_gx->buffer() + _old_scharr_gx->info()->offset_element_in_bytes(top_left_window_corner));
+ auto idy = reinterpret_cast<const int16_t *>(_old_scharr_gy->buffer() + _old_scharr_gy->info()->offset_element_in_bytes(top_left_window_corner));
+ static const int32x4_t nshifter_scharr = vdupq_n_s32(-W_BITS);
+
+ for(int ky = 0; ky < _window_dimension; ++ky, idx += row_stride, idy += row_stride)
+ {
+ int kx = 0;
+
+ // Calculate elements in blocks of four as long as possible
+ for(; kx <= _window_dimension - 4; kx += 4)
+ {
+ // Interpolation X
+ const int16x8_t ndx_row1 = vld1q_s16(idx + kx);
+ const int16x8_t ndx_row2 = vld1q_s16(idx + kx + row_stride);
+
+ const int32x4_t nxval = compute_bilinear_interpolation(ndx_row1, ndx_row2, nw00, nw01, nw10, nw11, nshifter_scharr);
+
+ // Interpolation Y
+ const int16x8_t ndy_row1 = vld1q_s16(idy + kx);
+ const int16x8_t ndy_row2 = vld1q_s16(idy + kx + row_stride);
+
+ const int32x4_t nyval = compute_bilinear_interpolation(ndy_row1, ndy_row2, nw00, nw01, nw10, nw11, nshifter_scharr);
+
+            // Store the intermediate data so that we don't need to recalculate it in a later stage
+ vst1q_s32(bilinear_ix + kx + ky * _window_dimension, nxval);
+ vst1q_s32(bilinear_iy + kx + ky * _window_dimension, nyval);
+
+ // Accumulate Ix^2
+ nA11 = vmlaq_s32(nA11, nxval, nxval);
+ // Accumulate Ix * Iy
+ nA12 = vmlaq_s32(nA12, nxval, nyval);
+ // Accumulate Iy^2
+ nA22 = vmlaq_s32(nA22, nyval, nyval);
+ }
+
+ // Calculate the leftover elements
+ for(; kx < _window_dimension; ++kx)
+ {
+ const int32_t ixval = get_pixel<int16_t>(_old_scharr_gx, top_left_window_corner.x() + kx, top_left_window_corner.y() + ky,
+ iw00, iw01, iw10, iw11, W_BITS);
+ const int32_t iyval = get_pixel<int16_t>(_old_scharr_gy, top_left_window_corner.x() + kx, top_left_window_corner.y() + ky,
+ iw00, iw01, iw10, iw11, W_BITS);
+
+ iA11 += ixval * ixval;
+ iA12 += ixval * iyval;
+ iA22 += iyval * iyval;
+
+ bilinear_ix[kx + ky * _window_dimension] = ixval;
+ bilinear_iy[kx + ky * _window_dimension] = iyval;
+ }
+ }
+
+ iA11 += vgetq_lane_s32(nA11, 0) + vgetq_lane_s32(nA11, 1) + vgetq_lane_s32(nA11, 2) + vgetq_lane_s32(nA11, 3);
+ iA12 += vgetq_lane_s32(nA12, 0) + vgetq_lane_s32(nA12, 1) + vgetq_lane_s32(nA12, 2) + vgetq_lane_s32(nA12, 3);
+ iA22 += vgetq_lane_s32(nA22, 0) + vgetq_lane_s32(nA22, 1) + vgetq_lane_s32(nA22, 2) + vgetq_lane_s32(nA22, 3);
+
+ return std::make_tuple(iA11, iA12, iA22);
+}
+
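+// Computes the image mismatch vector over the tracking window
+//     b = | sum(It * Ix) |
+//         | sum(It * Iy) |
+// where It is the temporal difference between the new and old frames.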
+std::pair<int, int> NELKTrackerKernel::compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy)
+{
+ int ib1 = 0;
+ int ib2 = 0;
+
+ int32x4_t nb1 = vdupq_n_s32(0);
+ int32x4_t nb2 = vdupq_n_s32(0);
+
+ // Compute weights for the old keypoint
+ float old_keypoint_int_x = 0;
+ float old_keypoint_int_y = 0;
+
+ const float old_wx = std::modf(old_keypoint.x, &old_keypoint_int_x);
+ const float old_wy = std::modf(old_keypoint.y, &old_keypoint_int_y);
+
+ const int iw00_old = roundf((1.0f - old_wx) * (1.0f - old_wy) * D0);
+ const int iw01_old = roundf(old_wx * (1.0f - old_wy) * D0);
+ const int iw10_old = roundf((1.0f - old_wx) * old_wy * D0);
+ const int iw11_old = D0 - iw00_old - iw01_old - iw10_old;
+
+ const int16x4_t nw00_old = vdup_n_s16(iw00_old);
+ const int16x4_t nw01_old = vdup_n_s16(iw01_old);
+ const int16x4_t nw10_old = vdup_n_s16(iw10_old);
+ const int16x4_t nw11_old = vdup_n_s16(iw11_old);
+
+ // Compute weights for the new keypoint
+ float new_keypoint_int_x = 0;
+ float new_keypoint_int_y = 0;
+
+ const float new_wx = std::modf(new_keypoint.x, &new_keypoint_int_x);
+ const float new_wy = std::modf(new_keypoint.y, &new_keypoint_int_y);
+
+ const int iw00_new = roundf((1.0f - new_wx) * (1.0f - new_wy) * D0);
+ const int iw01_new = roundf(new_wx * (1.0f - new_wy) * D0);
+ const int iw10_new = roundf((1.0f - new_wx) * new_wy * D0);
+ const int iw11_new = D0 - iw00_new - iw01_new - iw10_new;
+
+ const int16x4_t nw00_new = vdup_n_s16(iw00_new);
+ const int16x4_t nw01_new = vdup_n_s16(iw01_new);
+ const int16x4_t nw10_new = vdup_n_s16(iw10_new);
+ const int16x4_t nw11_new = vdup_n_s16(iw11_new);
+
+ const int row_stride = _input_new->info()->strides_in_bytes()[1];
+ const Coordinates top_left_window_corner_old(static_cast<int>(old_keypoint_int_x) - _window_dimension / 2, static_cast<int>(old_keypoint_int_y) - _window_dimension / 2);
+ const Coordinates top_left_window_corner_new(static_cast<int>(new_keypoint_int_x) - _window_dimension / 2, static_cast<int>(new_keypoint_int_y) - _window_dimension / 2);
+ const uint8_t *old_ptr = _input_old->buffer() + _input_old->info()->offset_element_in_bytes(top_left_window_corner_old);
+ const uint8_t *new_ptr = _input_new->buffer() + _input_new->info()->offset_element_in_bytes(top_left_window_corner_new);
+ static const int32x4_t nshifter_tensor = vdupq_n_s32(-(W_BITS - 5));
+
+ for(int ky = 0; ky < _window_dimension; ++ky, new_ptr += row_stride, old_ptr += row_stride)
+ {
+ int kx = 0;
+
+ // Calculate elements in blocks of four as long as possible
+ for(; kx <= _window_dimension - 4; kx += 4)
+ {
+ // Interpolation old tensor
+ const int16x8_t nold_row1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(old_ptr + kx)));
+ const int16x8_t nold_row2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(old_ptr + kx + row_stride)));
+
+ const int32x4_t noldval = compute_bilinear_interpolation(nold_row1, nold_row2, nw00_old, nw01_old, nw10_old, nw11_old, nshifter_tensor);
+
+ // Interpolation new tensor
+ const int16x8_t nnew_row1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(new_ptr + kx)));
+ const int16x8_t nnew_row2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(new_ptr + kx + row_stride)));
+
+ const int32x4_t nnewval = compute_bilinear_interpolation(nnew_row1, nnew_row2, nw00_new, nw01_new, nw10_new, nw11_new, nshifter_tensor);
+
+            // Calculate the It gradient, i.e. the pixelwise difference between the old and new tensors
+ const int32x4_t diff = vsubq_s32(nnewval, noldval);
+
+ // Load the Ix and Iy gradient computed in the previous stage
+ const int32x4_t nxval = vld1q_s32(bilinear_ix + kx + ky * _window_dimension);
+ const int32x4_t nyval = vld1q_s32(bilinear_iy + kx + ky * _window_dimension);
+
+            // Calculate Ix * It and Iy * It, and accumulate the results
+ nb1 = vmlaq_s32(nb1, diff, nxval);
+ nb2 = vmlaq_s32(nb2, diff, nyval);
+ }
+
+ // Calculate the leftover elements
+ for(; kx < _window_dimension; ++kx)
+ {
+ const int32_t ival = get_pixel<uint8_t>(_input_old, top_left_window_corner_old.x() + kx, top_left_window_corner_old.y() + ky,
+ iw00_old, iw01_old, iw10_old, iw11_old, W_BITS - 5);
+ const int32_t jval = get_pixel<uint8_t>(_input_new, top_left_window_corner_new.x() + kx, top_left_window_corner_new.y() + ky,
+ iw00_new, iw01_new, iw10_new, iw11_new, W_BITS - 5);
+
+ const int32_t diff = jval - ival;
+
+ ib1 += diff * bilinear_ix[kx + ky * _window_dimension];
+ ib2 += diff * bilinear_iy[kx + ky * _window_dimension];
+ }
+ }
+
+ ib1 += vgetq_lane_s32(nb1, 0) + vgetq_lane_s32(nb1, 1) + vgetq_lane_s32(nb1, 2) + vgetq_lane_s32(nb1, 3);
+ ib2 += vgetq_lane_s32(nb2, 0) + vgetq_lane_s32(nb2, 1) + vgetq_lane_s32(nb2, 2) + vgetq_lane_s32(nb2, 3);
+
+ return std::make_pair(ib1, ib2);
+}
+
+NELKTrackerKernel::NELKTrackerKernel()
+ : _input_old(nullptr), _input_new(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _old_points_internal(),
+ _new_points_internal(), _termination(Termination::TERM_CRITERIA_EPSILON), _use_initial_estimate(false), _pyramid_scale(0.0f), _epsilon(0.0f), _num_iterations(0), _window_dimension(0), _level(0),
+ _num_levels(0), _valid_region()
+{
+}
+
+BorderSize NELKTrackerKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NELKTrackerKernel::configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
+ const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
+ INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
+ Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
+ size_t level, size_t num_levels, float pyramid_scale)
+
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_old, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_new, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
+
+ _input_old = input_old;
+ _input_new = input_new;
+ _old_scharr_gx = old_scharr_gx;
+ _old_scharr_gy = old_scharr_gy;
+ _old_points = old_points;
+ _new_points_estimates = new_points_estimates;
+ _new_points = new_points;
+ _old_points_internal = old_points_internal;
+ _new_points_internal = new_points_internal;
+ _termination = termination;
+ _use_initial_estimate = use_initial_estimate;
+ _epsilon = epsilon;
+ _num_iterations = num_iterations;
+ _window_dimension = window_dimension;
+ _level = level;
+ _num_levels = num_levels;
+ _pyramid_scale = pyramid_scale;
+
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, old_points->num_values()));
+ window.set(Window::DimY, Window::Dimension(0, 1));
+
+ _valid_region = intersect_valid_regions(
+ input_old->info()->valid_region(),
+ input_new->info()->valid_region(),
+ old_scharr_gx->info()->valid_region(),
+ old_scharr_gy->info()->valid_region());
+
+ update_window_and_padding(window,
+ AccessWindowStatic(input_old->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)),
+ AccessWindowStatic(input_new->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gx->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gy->info(), _valid_region.start(0), _valid_region.start(1),
+ _valid_region.end(0), _valid_region.end(1)));
+
+ INEKernel::configure(window);
+}
+
+void NELKTrackerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ ARM_COMPUTE_ERROR_ON(_input_old->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_input_new->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_old_scharr_gx->buffer() == nullptr);
+ ARM_COMPUTE_ERROR_ON(_old_scharr_gy->buffer() == nullptr);
+
+ const int list_end = window.x().end();
+ const int list_start = window.x().start();
+
+ init_keypoints(list_start, list_end);
+
+ const int buffer_size = _window_dimension * _window_dimension;
+ int bilinear_ix[buffer_size];
+ int bilinear_iy[buffer_size];
+
+ const int half_window = _window_dimension / 2;
+
+ auto is_invalid_keypoint = [&](const NELKInternalKeypoint & keypoint)
+ {
+ const int x = std::floor(keypoint.x);
+ const int y = std::floor(keypoint.y);
+
+ return (x - half_window < _valid_region.start(0)) || (x + half_window >= _valid_region.end(0) - 1) || (y - half_window < _valid_region.start(1)) || (y + half_window >= _valid_region.end(1) - 1);
+ };
+
+ for(int list_indx = list_start; list_indx < list_end; ++list_indx)
+ {
+ NELKInternalKeypoint &old_keypoint = _old_points_internal->at(list_indx);
+ NELKInternalKeypoint &new_keypoint = _new_points_internal->at(list_indx);
+
+ if(!old_keypoint.tracking_status)
+ {
+ continue;
+ }
+
+ if(is_invalid_keypoint(old_keypoint))
+ {
+ if(_level == 0)
+ {
+ new_keypoint.tracking_status = false;
+ }
+
+ continue;
+ }
+
+ // Compute spatial gradient matrix
+ int iA11 = 0;
+ int iA12 = 0;
+ int iA22 = 0;
+
+ std::tie(iA11, iA12, iA22) = compute_spatial_gradient_matrix(old_keypoint, bilinear_ix, bilinear_iy);
+
+ const float A11 = iA11 * FLT_SCALE;
+ const float A12 = iA12 * FLT_SCALE;
+ const float A22 = iA22 * FLT_SCALE;
+
+ // Calculate minimum eigenvalue
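+        // For a symmetric 2x2 matrix the eigenvalues are (trace +/- sqrt(trace^2 - 4 * det)) / 2;
+        // the smaller root measures how well-conditioned the tracking window is.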
+ const float sum_A11_A22 = A11 + A22;
+ const float discriminant = sum_A11_A22 * sum_A11_A22 - 4.0f * (A11 * A22 - A12 * A12);
+        // Divide by _window_dimension^2 to reduce the floating point accumulation error
+ const float minimum_eigenvalue = (sum_A11_A22 - std::sqrt(discriminant)) / (2.0f * _window_dimension * _window_dimension);
+
+ // Determinant
+ const double D = A11 * A22 - A12 * A12;
+
+ // Check if it is a good point to track
+ if(minimum_eigenvalue < EIGENVALUE_THRESHOLD || D < DETERMINANT_THRESHOLD)
+ {
+ // Invalidate tracked point
+ if(_level == 0)
+ {
+ new_keypoint.tracking_status = false;
+ }
+
+ continue;
+ }
+
+ float prev_delta_x = 0.0f;
+ float prev_delta_y = 0.0f;
+
+ for(unsigned int j = 0; j < _num_iterations || _termination == Termination::TERM_CRITERIA_EPSILON; ++j)
+ {
+ if(is_invalid_keypoint(new_keypoint))
+ {
+ if(_level == 0)
+ {
+ new_keypoint.tracking_status = false;
+ }
+
+ break;
+ }
+
+ // Compute image mismatch vector
+ int ib1 = 0;
+ int ib2 = 0;
+
+ std::tie(ib1, ib2) = compute_image_mismatch_vector(old_keypoint, new_keypoint, bilinear_ix, bilinear_iy);
+
+ double b1 = ib1 * FLT_SCALE;
+ double b2 = ib2 * FLT_SCALE;
+
+ // Compute motion vector -> A^-1 * -b
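+            // By Cramer's rule: delta = ((A12 * b2 - A22 * b1) / D, (A12 * b1 - A11 * b2) / D)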
+ const float delta_x = (A12 * b2 - A22 * b1) / D;
+ const float delta_y = (A12 * b1 - A11 * b2) / D;
+
+ // Update the new position
+ new_keypoint.x += delta_x;
+ new_keypoint.y += delta_y;
+
+ const float mag2 = delta_x * delta_x + delta_y * delta_y;
+
+            // Check if the termination criterion uses EPSILON and, if so, whether it is satisfied
+ if(mag2 <= _epsilon && (_termination == Termination::TERM_CRITERIA_EPSILON || _termination == Termination::TERM_CRITERIA_BOTH))
+ {
+ break;
+ }
+
+            // Check convergence by analyzing the previous delta
+ if(j > 0 && std::fabs(delta_x + prev_delta_x) < 0.01f && std::fabs(delta_y + prev_delta_y) < 0.01f)
+ {
+ new_keypoint.x -= delta_x * _pyramid_scale;
+ new_keypoint.y -= delta_y * _pyramid_scale;
+ break;
+ }
+
+ prev_delta_x = delta_x;
+ prev_delta_y = delta_y;
+ }
+ }
+
+ if(_level == 0)
+ {
+ for(int list_indx = list_start; list_indx < list_end; ++list_indx)
+ {
+ const NELKInternalKeypoint &new_keypoint = _new_points_internal->at(list_indx);
+
+ _new_points->at(list_indx).x = roundf(new_keypoint.x);
+ _new_points->at(list_indx).y = roundf(new_keypoint.y);
+ _new_points->at(list_indx).tracking_status = new_keypoint.tracking_status ? 1 : 0;
+ }
+ }
+}
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..ab84efbf23
--- /dev/null
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window)
+{
+ const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
+ const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * window.thread_id();
+ const int window_step_x = 16 * window.num_threads();
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
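+    // Threads therefore process interleaved blocks of 16 columns round-robin:
+    // thread t handles columns [16t, 16t + 16), [16(t + num_threads), ...), and so on.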
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator ina(input0, win_a);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win_out, [&](const Coordinates & id)
+ {
+ if(id.x() > width_matrix_b)
+ {
+ return;
+ }
+
+ float32x4_t acc0 = vdupq_n_f32(0.f);
+ float32x4_t acc1 = vdupq_n_f32(0.f);
+ float32x4_t acc2 = vdupq_n_f32(0.f);
+ float32x4_t acc3 = vdupq_n_f32(0.f);
+
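+        // Unlike a plain GEMM, each output position selects its own weight matrix:
+        // id[1] indexes the third dimension of input1, so every row of the output
+        // window uses a different set of weights (the locally connected property).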
+ auto vec_a = reinterpret_cast<const float *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const float *>(input1->ptr_to_element(Coordinates(id[0], 0, id[1])));
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif
+
+ const float *vec_a_end_addr = vec_a + num_elems_vec_a;
+
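+        // Unrolled by four: each iteration consumes four elements of vec_a, with
+        // each pair of scalars multiplied against two consecutive rows of matrix_b.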
+ for(; vec_a <= (vec_a_end_addr - 4);)
+ {
+ float32x2_t a0l = vld1_f32(vec_a);
+
+ float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+ asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+ asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+
+ a0l = vld1_f32(vec_a);
+
+ b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+ b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+ b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+ b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+ acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+ acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+ acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+ acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+ acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+ acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+ acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+ acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+ vec_a += 2;
+ matrix_b += 2 * in_b_stride;
+ }
+
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const float a0 = *vec_a;
+
+ const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+ const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+ const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+ const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+ acc0 = vmlaq_n_f32(acc0, b00, a0);
+ acc1 = vmlaq_n_f32(acc1, b01, a0);
+ acc2 = vmlaq_n_f32(acc2, b02, a0);
+ acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+ vec_a += 1;
+ matrix_b += in_b_stride;
+ }
+
+ const auto vec_out = reinterpret_cast<float *>(out.ptr());
+
+ vst1q_f32(vec_out + 0, acc0);
+ vst1q_f32(vec_out + 4, acc1);
+ vst1q_f32(vec_out + 8, acc2);
+ vst1q_f32(vec_out + 12, acc3);
+ },
+ ina, out);
+}
+} // namespace
+
+NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ unsigned int num_elems_processed_per_iteration_x = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ vector_matrix_multiply_f32(_input0, _input1, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
new file mode 100644
index 0000000000..a874d219d7
--- /dev/null
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+// Defines for computing atan2
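+// SCALE_FACTOR = 256 / 360 rescales an angle in degrees to the [0, 255] range of a U8 pixel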
+constexpr float SCALE_FACTOR = 0.7111111111111111f;
+constexpr float PI = 3.141592653589793f;
+constexpr float SCALE_180 = 180.0f / PI;
+constexpr float SCALE_360 = SCALE_180 * SCALE_FACTOR;
+constexpr float PI_4 = 0.7853981633974483f;
+constexpr float COEFF1 = 0.0663f;
+constexpr float COEFF2 = 0.2447f;
+} // namespace
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline float16x8_t inv(float16x8_t x)
+{
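+    // One Newton-Raphson step on the hardware reciprocal estimate:
+    // x_n+1 = x_n * (2 - x * x_n), which is what vrecpsq_f16 + vmulq_f16 compute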
+ const float16x8_t estimate = vrecpeq_f16(x);
+ return vmulq_f16(estimate, vrecpsq_f16(x, estimate));
+}
+
+inline float16x8_t atan2_fast(float16x8_t gx, float16x8_t gy, float16x8_t scale)
+{
+ static const float16x8_t one = vdupq_n_f16(1.0f);
+ static const float16x8_t ninety = vdupq_n_f16(90.f * SCALE_FACTOR);
+ static const float16x8_t epsilon = vdupq_n_f16(1e-9f);
+ static const float16x8_t piover4 = vdupq_n_f16(PI_4);
+ static const float16x8_t coeff1 = vdupq_n_f16(COEFF1);
+ static const float16x8_t coeff2 = vdupq_n_f16(COEFF2);
+
+ const float16x8_t abs_gx = vabsq_f16(gx);
+ const float16x8_t abs_gy = vabsq_f16(gy);
+ const float16x8_t tmin = vminq_f16(abs_gx, abs_gy);
+ const float16x8_t tmax = vmaxq_f16(abs_gx, abs_gy);
+
+ // z = min(x, y) / max(x, y)
+ const float16x8_t z = vmulq_f16(tmin, inv(vaddq_f16(tmax, epsilon)));
+ const float16x8_t absz = vabsq_f16(z);
+
+    // arctan(z) ~= z * [pi/4 + (1 - |z|) * (0.2447 + 0.0663 * |z|)]
+ float16x8_t arctan = vmulq_f16(z, vfmaq_f16(piover4,
+ vsubq_f16(one, absz),
+ vfmaq_f16(coeff2, coeff1, absz)));
+
+    // Convert radians to degrees, applying a scale factor so that the result lies in [0, 255]
+ arctan = vmulq_f16(arctan, scale);
+
+    /* If |gy| > |gx| the ratio was inverted, so the angle is 90 - result */
+ return vbslq_f16(vcgeq_f16(abs_gx, abs_gy), arctan, vsubq_f16(ninety, arctan));
+}
+
+inline float16x8_t atan2_0_360(float16x8_t gx, float16x8_t gy)
+{
+ static const float16x8_t scale = vdupq_n_f16(SCALE_360);
+ static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+ static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
+
+ float16x8_t arctan = atan2_fast(gx, gy, scale);
+
+ // Choose correct quadrant
+ arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
+ arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
+
+ return arctan;
+}
+
+inline float16x8_t atan2_0_180(float16x8_t gx, float16x8_t gy)
+{
+ static const float16x8_t scale = vdupq_n_f16(SCALE_180);
+ static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR);
+ static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR);
+ static const float16x8_t zero = vdupq_n_f16(0.0f);
+
+ float16x8_t arctan = atan2_fast(gx, gy, scale);
+
+ // Choose correct quadrant
+ arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan);
+ arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan);
+ arctan = vbslq_f16(vcgtq_f16(arctan, oneeighty), vsubq_f16(arctan, oneeighty), arctan);
+
+ return arctan;
+}
+
+inline float32x4_t invsqrtv(float32x4_t x)
+{
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+
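+    // Two Newton-Raphson refinement steps on the hardware estimate of 1/sqrt(x):
+    // each step multiplies by vrsqrtsq_f32(x * e, e) = (3 - x * e^2) / 2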
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+
+ return sqrt_reciprocal;
+}
+
+inline float32x4_t sqrtv(float32x4_t x)
+{
+ float32x4_t res = vdupq_n_f32(0.5f);
+ return vmlaq_f32(res, x, invsqrtv(x));
+}
+
+inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
+{
+ return vqaddq_s16(vabsq_s16(input1), vabsq_s16(input2));
+}
+
+inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
+{
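+    // L2 magnitude: sqrt(gx^2 + gy^2), with the squares accumulated in 32 bits,
+    // the square root taken in f32, and the result saturated back down to s16.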
+ const int32x4x2_t square_x =
+ {
+ vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
+ vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
+ };
+
+ const int32x4x2_t square_y =
+ {
+ vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
+ vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
+ };
+
+ const uint32x4x2_t sum =
+ {
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]),
+ vreinterpretq_u32_s32(square_y.val[0])),
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]),
+ vreinterpretq_u32_s32(square_y.val[1]))
+ };
+
+ const float32x4x2_t res =
+ {
+ sqrtv(vcvtq_f32_u32(sum.val[0])),
+ sqrtv(vcvtq_f32_u32(sum.val[1]))
+ };
+
+ return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
+ vqmovn_s32(vcvtq_s32_f32(res.val[1])));
+}
+
+inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
+{
+ static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
+
+ const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
+ const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
+
+ // Compute fast atan2
+ const float16x8_t angle = atan2_0_360(inputx_f16, inputy_f16);
+
+ return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
+}
+
+inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
+{
+ static const float16x8_t zeropointfive = vdupq_n_f16(0.5f);
+
+ const float16x8_t inputx_f16 = vcvtq_f16_s16(input1);
+ const float16x8_t inputy_f16 = vcvtq_f16_s16(input2);
+
+ // Compute fast atan2
+ const float16x8_t angle = atan2_0_180(inputx_f16, inputy_f16);
+
+ return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive)));
+}
+
+template <MagnitudeType mag_type>
+inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx);
+
+template <>
+inline int16x8x2_t compute_magnitude<MagnitudeType::L2NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
+{
+ const int16x8x2_t mag =
+ {
+ magnitude_l2(in0.val[0], gx.val[0]),
+ magnitude_l2(in0.val[1], gx.val[1])
+ };
+
+ return mag;
+}
+
+template <>
+inline int16x8x2_t compute_magnitude<MagnitudeType::L1NORM>(const int16x8x2_t &in0, const int16x8x2_t &gx)
+{
+ const int16x8x2_t mag =
+ {
+ magnitude_l1(in0.val[0], gx.val[0]),
+ magnitude_l1(in0.val[1], gx.val[1])
+ };
+
+ return mag;
+}
+
+template <PhaseType phase_type>
+inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx);
+
+template <>
+inline uint8x16_t compute_phase<PhaseType::SIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
+{
+ return vcombine_u8(phase_signed(in0.val[0], gx.val[0]),
+ phase_signed(in0.val[1], gx.val[1]));
+}
+
+template <>
+inline uint8x16_t compute_phase<PhaseType::UNSIGNED>(const int16x8x2_t &in0, const int16x8x2_t &gx)
+{
+ return vcombine_u8(phase_unsigned(in0.val[0], gx.val[0]),
+ phase_unsigned(in0.val[1], gx.val[1]));
+}
+} // namespace fp16
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::NEMagnitudePhaseFP16Kernel()
+ : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gx, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gy, Format::S16);
+ ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
+
+ const bool run_mag = magnitude != nullptr;
+ const bool run_phase = phase != nullptr;
+
+ if(run_mag)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(magnitude, Format::S16);
+ }
+
+ if(run_phase)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(phase, Format::U8);
+ }
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ if(run_mag && run_phase)
+ {
+ /* Run magnitude and phase */
+ _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase;
+ }
+ else if(run_mag)
+ {
+ /* Run magnitude */
+ _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude;
+ }
+ else if(run_phase)
+ {
+ /* Run phase */
+ _func = &NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
+ magnitude_access,
+ phase_access);
+
+ ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
+ gy->info()->valid_region());
+
+ magnitude_access.set_valid_region(win, valid_region);
+ phase_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ };
+
+ const int16x8x2_t input2 =
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ };
+
+ // Compute and store magnitude
+ const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
+
+ /* Store magnitude */
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+ },
+ gx, gy, magnitude);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ // Compute and store phase
+ vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
+ },
+ gx, gy, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::magnitude_phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ // Compute and store magnitude
+ const int16x8x2_t mag = fp16::compute_magnitude<mag_type>(input1, input2);
+
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+
+ // Compute and store phase
+ vst1q_u8(phase.ptr(), fp16::compute_phase<phase_type>(input1, input2));
+ },
+ gx, gy, magnitude, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseFP16Kernel<mag_type, phase_type>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
+template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
+#endif
+
+namespace
+{
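+ /* Fast reciprocal via one Newton-Raphson step: vrecpeq_f32 provides a
+ * coarse estimate x0 of 1/x and vrecpsq_f32(x, x0) computes the refinement
+ * factor (2 - x * x0), so the result is x0 * (2 - x * x0). A single step is
+ * sufficient here since the quotient only feeds the approximate atan2
+ * below. */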
+inline float32x4_t inv(float32x4_t x)
+{
+ float32x4_t result = vrecpeq_f32(x);
+ result = vmulq_f32(vrecpsq_f32(x, result), result);
+ return result;
+}
+
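+ /* Approximated atan2 returning degrees in [0, 360), pre-multiplied by the
+ * scale factor that maps the full circle onto the U8 range [0, 255]. The
+ * core is the polynomial approximation
+ * atan(z) ~= pi/4 * z + z * (1 - abs(z)) * (0.2447 + 0.0663 * abs(z))
+ * for z in [0, 1], followed by octant and quadrant corrections; its maximum
+ * error is well below one degree. */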
+inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
+{
+ const float32x4_t zero = vdupq_n_f32(0.0f);
+ const float32x4_t epsilon = vdupq_n_f32(1e-9f);
+ const float32x4_t piover4 = vdupq_n_f32(PI_4);
+ const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
+ const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
+ const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR);
+ const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR);
+ const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
+ const float32x4_t scale = vdupq_n_f32(SCALE_360);
+
+ float32x4_t abs_gx = vabsq_f32(gx);
+ float32x4_t abs_gy = vabsq_f32(gy);
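+ /* Octant reduction: z = min(abs(gx), abs(gy)) / max(abs(gx), abs(gy))
+ * always lies in [0, 1], keeping the polynomial below inside its valid range */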
+ float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
+ float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
+ float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
+ float32x4_t absz = vabsq_f32(z);
+ float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
+
+ /* Compute y = pi/4 * z - z*(abs(z)-1)*(0.2447 + 0.0663*abs(z)) */
+ float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
+ result = vmulq_f32(result, term);
+ result = vmlaq_f32(result, piover4, z);
+
+ /* Radians to degrees conversion, with a scale factor applied so that the result lies in [0, 255] */
+ result = vmulq_f32(result, scale);
+
+ /* If abs(gy) > abs(gx), result = 90 - result */
+ result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
+
+ /* Choose correct quadrant */
+ result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
+ result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
+
+ return result;
+}
+
+inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
+{
+ const float32x4_t zero = vdupq_n_f32(0.0f);
+ const float32x4_t epsilon = vdupq_n_f32(1e-9f); // epsilon used to avoid division by 0
+ const float32x4_t piover4 = vdupq_n_f32(PI_4);
+ const float32x4_t coeff1 = vdupq_n_f32(COEFF1);
+ const float32x4_t coeff2 = vdupq_n_f32(COEFF2);
+ const float32x4_t ninety = vdupq_n_f32(90.0f);
+ const float32x4_t oneeighty = vdupq_n_f32(180.0f);
+ const float32x4_t threesixty = vdupq_n_f32(360.0f);
+ const float32x4_t scale = vdupq_n_f32(SCALE_180);
+
+ float32x4_t abs_gx = vabsq_f32(gx);
+ float32x4_t abs_gy = vabsq_f32(gy);
+ float32x4_t tmin = vminq_f32(abs_gx, abs_gy);
+ float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy);
+ float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
+ float32x4_t absz = vabsq_f32(z);
+
+ /* Compute y = pi/4 * z - z*(abs(z)-1)*(0.2447 + 0.0663*abs(z)) */
+ float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
+ float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
+ result = vmulq_f32(result, term);
+ result = vmlaq_f32(result, piover4, z);
+
+ /* Radians to degrees conversion */
+ result = vmulq_f32(result, scale);
+
+ /* If abs(gy) > abs(gx), result = 90 - result */
+ result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
+
+ /* Choose correct quadrant */
+ result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
+ result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
+ result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
+
+ return result;
+}
+
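+ /* Reciprocal square root with two Newton-Raphson refinements:
+ * vrsqrtsq_f32(x * x0, x0) computes (3 - x * x0 * x0) / 2, so each step is
+ * x1 = x0 * (3 - x * x0 * x0) / 2 and roughly doubles the precision of the
+ * initial vrsqrteq_f32 estimate. */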
+inline float32x4_t invsqrtv(float32x4_t x)
+{
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
+ sqrt_reciprocal);
+
+ return sqrt_reciprocal;
+}
+
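+ /* sqrt(x) computed as x * (1 / sqrt(x)). Accumulating onto 0.5f via
+ * vmlaq_f32 folds a rounding bias into the result, so the later
+ * float-to-integer conversion of the magnitude rounds to nearest instead
+ * of truncating. */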
+inline float32x4_t sqrtv(float32x4_t x)
+{
+ float32x4_t res = vdupq_n_f32(0.5f);
+ return vmlaq_f32(res, x, invsqrtv(x));
+}
+
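+ /* L2 magnitude: widen gx and gy to 32 bit, form gx^2 + gy^2, take the
+ * square root in float and narrow back to S16 with saturation. */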
+inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
+{
+ const int32x4x2_t square_x =
+ {
+ {
+ vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
+ vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
+ }
+ };
+
+ const int32x4x2_t square_y =
+ {
+ {
+ vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
+ vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
+ }
+ };
+
+ const uint32x4x2_t sum =
+ {
+ {
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
+ vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
+ }
+ };
+
+ const float32x4x2_t res =
+ {
+ {
+ sqrtv(vcvtq_f32_u32(sum.val[0])),
+ sqrtv(vcvtq_f32_u32(sum.val[1]))
+ }
+ };
+
+ return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
+ vqmovn_s32(vcvtq_s32_f32(res.val[1])));
+}
+
+inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
+{
+ int16x8_t gx_abs = vabsq_s16(input1);
+ int16x8_t gy_abs = vabsq_s16(input2);
+
+ /* Saturating add */
+ return vqaddq_s16(gx_abs, gy_abs);
+}
+
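+ /* Signed phase: the angle in [0, 360) degrees is mapped onto [0, 255] and
+ * rounded by adding 0.5f before the saturating narrowing conversions;
+ * phase_unsigned below does the same for the [0, 180) range. */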
+inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
+{
+ const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
+
+ float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
+ float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
+ float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
+ float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
+
+ /* Compute fast atan2 */
+ float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
+ float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low);
+
+ angle_high = vaddq_f32(angle_high, zeropointfive);
+ angle_low = vaddq_f32(angle_low, zeropointfive);
+
+ return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
+ vqmovun_s32(vcvtq_s32_f32(angle_high))));
+}
+
+inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
+{
+ const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
+
+ float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
+ float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
+ float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
+ float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
+
+ /* Compute fast atan2 */
+ float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
+ float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low);
+
+ angle_high = vaddq_f32(angle_high, zeropointfive);
+ angle_low = vaddq_f32(angle_low, zeropointfive);
+
+ return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
+ vqmovun_s32(vcvtq_s32_f32(angle_high))));
+}
+} // namespace
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
+ : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
+{
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
+
+ const bool run_mag = magnitude != nullptr;
+ const bool run_phase = phase != nullptr;
+
+ if(run_mag)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
+ }
+
+ if(run_phase)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ }
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ if(run_mag && run_phase)
+ {
+ /* Run magnitude and phase */
+ _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
+ }
+ else if(run_mag)
+ {
+ /* Run magnitude */
+ _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
+ }
+ else if(run_phase)
+ {
+ /* Run phase */
+ _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("At least one output must be non-NULL");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
+ magnitude_access,
+ phase_access);
+
+ ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
+ gy->info()->valid_region());
+
+ magnitude_access.set_valid_region(win, valid_region);
+ phase_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ /* Compute magnitude */
+ int16x8x2_t mag{ {} };
+
+ if(MagnitudeType::L2NORM == mag_type)
+ {
+ mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
+ }
+
+ /* Store magnitude */
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+ },
+ gx, gy, magnitude);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ /* Compute phase */
+ uint8x8x2_t vphase{ {} };
+
+ if(PhaseType::SIGNED == phase_type)
+ {
+ vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
+ }
+
+ /* Store phase */
+ vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
+ },
+ gx, gy, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
+{
+ Iterator gx(_gx, window);
+ Iterator gy(_gy, window);
+ Iterator magnitude(_magnitude, window);
+ Iterator phase(_phase, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t input1 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
+ }
+ };
+
+ const int16x8x2_t input2 =
+ {
+ {
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
+ vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
+ }
+ };
+
+ /* Compute magnitude */
+ int16x8x2_t mag{ {} };
+
+ if(MagnitudeType::L2NORM == mag_type)
+ {
+ mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
+ mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
+ }
+
+ /* Store magnitude */
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
+ vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
+
+ /* Compute phase */
+ uint8x8x2_t vphase{ {} };
+
+ if(PhaseType::SIGNED == phase_type)
+ {
+ vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
+ }
+ else
+ {
+ vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
+ vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
+ }
+
+ /* Store phase */
+ vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
+ },
+ gx, gy, magnitude, phase);
+}
+
+template <MagnitudeType mag_type, PhaseType phase_type>
+void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
+template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
new file mode 100644
index 0000000000..4616203d66
--- /dev/null
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <tuple>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
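+ /* Accumulates 16 U8 pixels per iteration: widening adds reduce each vector
+ * of pixels to a uint32x2_t partial sum which vpadal_u32 folds into a
+ * 64-bit accumulator, so the running totals cannot overflow for any
+ * realistic image size. */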
+template <bool calc_sum_squared>
+std::pair<uint64x1_t, uint64x1_t> accumulate(const Window &window, Iterator &iterator)
+{
+ uint64x1_t sum = vdup_n_u64(0);
+ uint64x1_t sum_squared = vdup_n_u64(0);
+
+ // Calculate sum
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t in_data = vld1q_u8(iterator.ptr());
+
+ // Sum of the low and high elements of data
+ const uint16x8_t tmp0 = vaddl_u8(vget_low_u8(in_data), vget_high_u8(in_data));
+ const uint32x4_t tmp1 = vaddl_u16(vget_low_u16(tmp0), vget_high_u16(tmp0));
+ const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
+
+ // Update sum
+ sum = vpadal_u32(sum, tmp2);
+
+ if(calc_sum_squared)
+ {
+ const uint16x8_t square_data_low = vmull_u8(vget_low_u8(in_data), vget_low_u8(in_data));
+ const uint16x8_t square_data_high = vmull_u8(vget_high_u8(in_data), vget_high_u8(in_data));
+
+ // Sum of the low and high elements of data
+ const uint32x4_t tmp0_low = vaddl_u16(vget_low_u16(square_data_low), vget_high_u16(square_data_low));
+ const uint32x4_t tmp0_high = vaddl_u16(vget_low_u16(square_data_high), vget_high_u16(square_data_high));
+ const uint32x4_t tmp1 = vaddq_u32(tmp0_low, tmp0_high);
+ const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
+
+ // Update sum
+ sum_squared = vpadal_u32(sum_squared, tmp2);
+ }
+ },
+ iterator);
+
+ return std::make_pair(sum, sum_squared);
+}
+} // namespace
+
+NEMeanStdDevKernel::NEMeanStdDevKernel()
+ : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx()
+{
+}
+
+void NEMeanStdDevKernel::configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev, uint64_t *global_sum_squared)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == mean);
+ ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
+ ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ _input = input;
+ _mean = mean;
+ _stddev = stddev;
+ _global_sum = global_sum;
+ _global_sum_squared = global_sum_squared;
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEMeanStdDevKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ Iterator input(_input, window);
+
+ uint64x1_t local_sum = vdup_n_u64(0);
+ uint64x1_t local_sum_squared = vdup_n_u64(0);
+
+ if(_stddev != nullptr)
+ {
+ std::tie(local_sum, local_sum_squared) = accumulate<true>(window, input);
+ }
+ else
+ {
+ std::tie(local_sum, local_sum_squared) = accumulate<false>(window, input);
+ }
+
+ const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
+
+ // Merge sum and calculate mean and stddev
+ std::unique_lock<std::mutex> lock(_mtx);
+
+ *_global_sum += vget_lane_u64(local_sum, 0);
+
+ const float mean = *_global_sum / num_pixels;
+ *_mean = mean;
+
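+ /* Standard deviation from the identity Var(x) = E[x^2] - E[x]^2 applied to
+ * the globally merged sums */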
+ if(_stddev != nullptr)
+ {
+ const uint64_t tmp_sum_squared = vget_lane_u64(local_sum_squared, 0);
+ *_global_sum_squared += tmp_sum_squared;
+ *_stddev = std::sqrt((*_global_sum_squared / num_pixels) - (mean * mean));
+ }
+
+ lock.unlock();
+}
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
new file mode 100644
index 0000000000..601a0e109f
--- /dev/null
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void sort(uint8x8_t &a, uint8x8_t &b)
+{
+ const uint8x8_t min = vmin_u8(a, b);
+ const uint8x8_t max = vmax_u8(a, b);
+ a = min;
+ b = max;
+}
+} // namespace
+
+BorderSize NEMedian3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEMedian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+ constexpr int rect_offset_xy = -1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NEMedian3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+
+ const unsigned char *input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ const unsigned char *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ const unsigned char *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ uint8x8_t p0 = vget_low_u8(top_data);
+ uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
+ uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
+ uint8x8_t p3 = vget_low_u8(mid_data);
+ uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p6 = vget_low_u8(bot_data);
+ uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
+ uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
+
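+ /* Partial sorting network for 9 elements (generated with
+ * http://pages.ripco.net/~jgamble/nw.html, with comparisons that cannot
+ * affect the median removed): after these exchanges p4 holds the median */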
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+
+ sort(p0, p1);
+ sort(p3, p4);
+ sort(p6, p7);
+
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+
+ sort(p0, p3);
+ sort(p5, p8);
+ sort(p4, p7);
+
+ sort(p3, p6);
+ sort(p1, p4);
+ sort(p2, p5);
+
+ sort(p4, p7);
+ sort(p4, p2);
+ sort(p6, p4);
+
+ sort(p4, p2);
+
+ vst1_u8(output.ptr(), p4);
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
new file mode 100644
index 0000000000..b188614752
--- /dev/null
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <climits>
+#include <cstddef>
+
+namespace arm_compute
+{
+NEMinMaxKernel::NEMinMaxKernel()
+ : _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx()
+{
+}
+
+void NEMinMaxKernel::configure(const IImage *input, int32_t *min, int32_t *max)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(nullptr == min);
+ ARM_COMPUTE_ERROR_ON(nullptr == max);
+
+ _input = input;
+ _min = min;
+ _max = max;
+
+ switch(input->info()->format())
+ {
+ case Format::U8:
+ _min_init = UCHAR_MAX;
+ _max_init = 0;
+ _func = &NEMinMaxKernel::minmax_U8;
+ break;
+ case Format::S16:
+ _min_init = SHRT_MAX;
+ _max_init = SHRT_MIN;
+ _func = &NEMinMaxKernel::minmax_S16;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEMinMaxKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+void NEMinMaxKernel::reset()
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ *_min = _min_init;
+ *_max = _max_init;
+}
+
+template <typename T>
+void NEMinMaxKernel::update_min_max(const T min, const T max)
+{
+ std::lock_guard<std::mutex> lock(_mtx);
+
+ if(min < *_min)
+ {
+ *_min = min;
+ }
+
+ if(max > *_max)
+ {
+ *_max = max;
+ }
+}
+
+void NEMinMaxKernel::minmax_U8(const Window &win)
+{
+ uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX);
+ uint8x8_t carry_max = vdup_n_u8(0);
+
+ Iterator input(_input, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t pixels = vld1q_u8(input.ptr());
+ const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
+ carry_min = vmin_u8(tmp_min, carry_min);
+ carry_max = vmax_u8(tmp_max, carry_max);
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_u8(carry_min, carry_min);
+ carry_max = vpmax_u8(carry_max, carry_max);
+ carry_min = vpmin_u8(carry_min, carry_min);
+ carry_max = vpmax_u8(carry_max, carry_max);
+ carry_min = vpmin_u8(carry_min, carry_min);
+ carry_max = vpmax_u8(carry_max, carry_max);
+
+ // Extract max/min values
+ const uint8_t min_i = vget_lane_u8(carry_min, 0);
+ const uint8_t max_i = vget_lane_u8(carry_max, 0);
+
+ // Perform reduction of local min/max values
+ update_min_max(min_i, max_i);
+}
+
+void NEMinMaxKernel::minmax_S16(const Window &win)
+{
+ int16x4_t carry_min = vdup_n_s16(SHRT_MAX);
+ int16x4_t carry_max = vdup_n_s16(SHRT_MIN);
+
+ Iterator input(_input, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
+ const int16x8x2_t pixels = vld2q_s16(in_ptr);
+ const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
+ const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
+ const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
+ const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
+ carry_min = vmin_s16(tmp_min2, carry_min);
+ carry_max = vmax_s16(tmp_max2, carry_max);
+ },
+ input);
+
+ // Reduce result
+ carry_min = vpmin_s16(carry_min, carry_min);
+ carry_max = vpmax_s16(carry_max, carry_max);
+ carry_min = vpmin_s16(carry_min, carry_min);
+ carry_max = vpmax_s16(carry_max, carry_max);
+
+ // Extract max/min values
+ const int16_t min_i = vget_lane_s16(carry_min, 0);
+ const int16_t max_i = vget_lane_s16(carry_max, 0);
+
+ // Perform reduction of local min/max values
+ update_min_max(min_i, max_i);
+}
+
+NEMinMaxLocationKernel::NEMinMaxLocationKernel()
+ : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr), _num_elems_processed_per_iteration(0)
+{
+}
+
+bool NEMinMaxLocationKernel::is_parallelisable() const
+{
+ return false;
+}
+
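+ /* C++11 stand-in for std::index_sequence: gen_index_seq<16>::type expands
+ * to index_seq<0, ..., 15>, which is used below to instantiate one
+ * minmax_loc function-table entry per combination of the four boolean
+ * template parameters. */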
+template <unsigned int...>
+struct index_seq
+{
+ index_seq() = default;
+ index_seq(const index_seq &) = default;
+ index_seq &operator=(const index_seq &) = default;
+ index_seq(index_seq &&) noexcept = default;
+ index_seq &operator=(index_seq &&) noexcept = default;
+ virtual ~index_seq() = default;
+};
+template <unsigned int N, unsigned int... S>
+struct gen_index_seq : gen_index_seq < N - 1, N - 1, S... >
+{
+};
+template <unsigned int... S>
+struct gen_index_seq<0u, S...> : index_seq<S...>
+{
+ using type = index_seq<S...>;
+};
+
+template <class T, unsigned int... N>
+struct NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>
+{
+ static const NEMinMaxLocationKernel::MinMaxLocFunction func_table[sizeof...(N)];
+};
+
+template <class T, unsigned int... N>
+const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>::func_table[sizeof...(N)] =
+{
+ &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
+};
+
+void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max,
+ ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
+ uint32_t *min_count, uint32_t *max_count)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8, Format::S16);
+ ARM_COMPUTE_ERROR_ON(nullptr == min);
+ ARM_COMPUTE_ERROR_ON(nullptr == max);
+
+ _input = input;
+ _min = min;
+ _max = max;
+ _min_count = min_count;
+ _max_count = max_count;
+ _min_loc = min_loc;
+ _max_loc = max_loc;
+
+ unsigned int count_min = (nullptr != min_count ? 1 : 0);
+ unsigned int count_max = (nullptr != max_count ? 1 : 0);
+ unsigned int loc_min = (nullptr != min_loc ? 1 : 0);
+ unsigned int loc_max = (nullptr != max_loc ? 1 : 0);
+
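+ /* Pack the four "output requested" flags into a 4-bit table index: bit 3 =
+ * min count, bit 2 = max count, bit 1 = min location, bit 0 = max location,
+ * matching the bit tests used to build create_func_table. */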
+ unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max;
+
+ switch(input->info()->format())
+ {
+ case Format::U8:
+ _func = create_func_table<uint8_t, gen_index_seq<16>::type>::func_table[table_idx];
+ break;
+ case Format::S16:
+ _func = create_func_table<int16_t, gen_index_seq<16>::type>::func_table[table_idx];
+ break;
+ default:
+ ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ break;
+ }
+
+ _num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, _num_elems_processed_per_iteration));
+
+ INEKernel::configure(win);
+}
+
+void NEMinMaxLocationKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
+void NEMinMaxLocationKernel::minmax_loc(const Window &win)
+{
+ if(count_min || count_max || loc_min || loc_max)
+ {
+ Iterator input(_input, win);
+
+ size_t min_count = 0;
+ size_t max_count = 0;
+ unsigned int step = _num_elems_processed_per_iteration;
+
+ // Clear min location array
+ if(loc_min)
+ {
+ _min_loc->clear();
+ }
+
+ // Clear max location array
+ if(loc_max)
+ {
+ _max_loc->clear();
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ auto in_ptr = reinterpret_cast<const T *>(input.ptr());
+ int32_t idx = id.x();
+ int32_t idy = id.y();
+
+ for(unsigned int i = 0; i < step; ++i)
+ {
+ const T pixel = *in_ptr++;
+ Coordinates2D p{ idx++, idy };
+
+ if(count_min || loc_min)
+ {
+ if(*_min == pixel)
+ {
+ if(count_min)
+ {
+ ++min_count;
+ }
+
+ if(loc_min)
+ {
+ _min_loc->push_back(p);
+ }
+ }
+ }
+
+ if(count_max || loc_max)
+ {
+ if(*_max == pixel)
+ {
+ if(count_max)
+ {
+ ++max_count;
+ }
+
+ if(loc_max)
+ {
+ _max_loc->push_back(p);
+ }
+ }
+ }
+ }
+ },
+ input);
+
+ if(count_min)
+ {
+ *_min_count = min_count;
+ }
+
+ if(count_max)
+ {
+ *_max_count = max_count;
+ }
+ }
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
new file mode 100644
index 0000000000..03d1409be1
--- /dev/null
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <array>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+namespace
+{
+const uint8x16_t zero_u8 = vdupq_n_u8(0);
+
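+ /* Horizontal reduction of a mask row: vextq_u8 shifts the 16-byte row left
+ * one lane at a time, so each of the 8 output pixels ends up with the
+ * min/max of `columns` consecutive input pixels. */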
+template <size_t columns>
+inline uint8x8_t min_row(uint8x16_t row_data)
+{
+ uint8x8_t min = vget_low_u8(row_data);
+
+ for(size_t c = 1; c < columns; ++c)
+ {
+ row_data = vextq_u8(row_data, zero_u8, 1);
+ min = vmin_u8(min, vget_low_u8(row_data));
+ }
+
+ return min;
+}
+
+template <size_t columns>
+inline uint8x8_t max_row(uint8x16_t row_data)
+{
+ uint8x8_t max = vget_low_u8(row_data);
+
+ for(size_t c = 1; c < columns; ++c)
+ {
+ row_data = vextq_u8(row_data, zero_u8, 1);
+ max = vmax_u8(max, vget_low_u8(row_data));
+ }
+
+ return max;
+}
+
+inline void sort(uint8x8_t &a, uint8x8_t &b)
+{
+ const uint8x8_t min = vmin_u8(a, b);
+ const uint8x8_t max = vmax_u8(a, b);
+ a = min;
+ b = max;
+}
+
+// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
+// Calculations that do not affect the median were removed.
+inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4)
+{
+ sort(p0, p1);
+ sort(p2, p3);
+ sort(p0, p2);
+ sort(p1, p3);
+ sort(p1, p2);
+ sort(p0, p4);
+ sort(p1, p4);
+ sort(p2, p4);
+}
+
+inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2,
+ uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5,
+ uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8)
+{
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+ sort(p0, p1);
+ sort(p3, p4);
+ sort(p6, p7);
+ sort(p1, p2);
+ sort(p4, p5);
+ sort(p7, p8);
+ sort(p0, p3);
+ sort(p5, p8);
+ sort(p4, p7);
+ sort(p3, p6);
+ sort(p1, p4);
+ sort(p2, p5);
+ sort(p4, p7);
+ sort(p4, p2);
+ sort(p6, p4);
+ sort(p4, p2);
+}
+
+inline void sort21(uint8x8_t p[21])
+{
+ sort(p[0], p[1]);
+ sort(p[2], p[3]);
+ sort(p[4], p[5]);
+ sort(p[6], p[7]);
+ sort(p[8], p[9]);
+ sort(p[10], p[11]);
+ sort(p[12], p[13]);
+ sort(p[14], p[15]);
+ sort(p[16], p[17]);
+ sort(p[18], p[19]);
+ sort(p[0], p[2]);
+ sort(p[1], p[3]);
+ sort(p[4], p[6]);
+ sort(p[5], p[7]);
+ sort(p[8], p[10]);
+ sort(p[9], p[11]);
+ sort(p[12], p[14]);
+ sort(p[13], p[15]);
+ sort(p[16], p[18]);
+ sort(p[17], p[19]);
+ sort(p[1], p[2]);
+ sort(p[5], p[6]);
+ sort(p[0], p[4]);
+ sort(p[3], p[7]);
+ sort(p[9], p[10]);
+ sort(p[13], p[14]);
+ sort(p[8], p[12]);
+ sort(p[11], p[15]);
+ sort(p[17], p[18]);
+ sort(p[16], p[20]);
+ sort(p[1], p[5]);
+ sort(p[2], p[6]);
+ sort(p[9], p[13]);
+ sort(p[10], p[14]);
+ sort(p[0], p[8]);
+ sort(p[7], p[15]);
+ sort(p[17], p[20]);
+ sort(p[1], p[4]);
+ sort(p[3], p[6]);
+ sort(p[9], p[12]);
+ sort(p[11], p[14]);
+ sort(p[18], p[20]);
+ sort(p[0], p[16]);
+ sort(p[2], p[4]);
+ sort(p[3], p[5]);
+ sort(p[10], p[12]);
+ sort(p[11], p[13]);
+ sort(p[1], p[9]);
+ sort(p[6], p[14]);
+ sort(p[19], p[20]);
+ sort(p[3], p[4]);
+ sort(p[11], p[12]);
+ sort(p[1], p[8]);
+ sort(p[2], p[10]);
+ sort(p[5], p[13]);
+ sort(p[7], p[14]);
+ sort(p[3], p[11]);
+ sort(p[2], p[8]);
+ sort(p[4], p[12]);
+ sort(p[7], p[13]);
+ sort(p[1], p[17]);
+ sort(p[3], p[10]);
+ sort(p[5], p[12]);
+ sort(p[1], p[16]);
+ sort(p[2], p[18]);
+ sort(p[3], p[9]);
+ sort(p[6], p[12]);
+ sort(p[2], p[16]);
+ sort(p[3], p[8]);
+ sort(p[7], p[12]);
+ sort(p[5], p[9]);
+ sort(p[6], p[10]);
+ sort(p[4], p[8]);
+ sort(p[7], p[11]);
+ sort(p[3], p[19]);
+ sort(p[5], p[8]);
+ sort(p[7], p[10]);
+ sort(p[3], p[18]);
+ sort(p[4], p[20]);
+ sort(p[6], p[8]);
+ sort(p[7], p[9]);
+ sort(p[3], p[17]);
+ sort(p[5], p[20]);
+ sort(p[7], p[8]);
+ sort(p[3], p[16]);
+ sort(p[6], p[20]);
+ sort(p[5], p[17]);
+ sort(p[7], p[20]);
+ sort(p[4], p[16]);
+ sort(p[6], p[18]);
+ sort(p[5], p[16]);
+ sort(p[7], p[19]);
+ sort(p[7], p[18]);
+ sort(p[6], p[16]);
+ sort(p[7], p[17]);
+ sort(p[10], p[18]);
+ sort(p[7], p[16]);
+ sort(p[9], p[17]);
+ sort(p[8], p[16]);
+ sort(p[9], p[16]);
+ sort(p[10], p[16]);
+}
+
+inline void sort25(uint8x8_t p[25])
+{
+ sort(p[1], p[2]);
+ sort(p[0], p[1]);
+ sort(p[1], p[2]);
+ sort(p[4], p[5]);
+ sort(p[3], p[4]);
+ sort(p[4], p[5]);
+ sort(p[0], p[3]);
+ sort(p[2], p[5]);
+ sort(p[2], p[3]);
+ sort(p[1], p[4]);
+ sort(p[1], p[2]);
+ sort(p[3], p[4]);
+ sort(p[7], p[8]);
+ sort(p[6], p[7]);
+ sort(p[7], p[8]);
+ sort(p[10], p[11]);
+ sort(p[9], p[10]);
+ sort(p[10], p[11]);
+ sort(p[6], p[9]);
+ sort(p[8], p[11]);
+ sort(p[8], p[9]);
+ sort(p[7], p[10]);
+ sort(p[7], p[8]);
+ sort(p[9], p[10]);
+ sort(p[0], p[6]);
+ sort(p[4], p[10]);
+ sort(p[4], p[6]);
+ sort(p[2], p[8]);
+ sort(p[2], p[4]);
+ sort(p[6], p[8]);
+ sort(p[1], p[7]);
+ sort(p[5], p[11]);
+ sort(p[5], p[7]);
+ sort(p[3], p[9]);
+ sort(p[3], p[5]);
+ sort(p[7], p[9]);
+ sort(p[1], p[2]);
+ sort(p[3], p[4]);
+ sort(p[5], p[6]);
+ sort(p[7], p[8]);
+ sort(p[9], p[10]);
+ sort(p[13], p[14]);
+ sort(p[12], p[13]);
+ sort(p[13], p[14]);
+ sort(p[16], p[17]);
+ sort(p[15], p[16]);
+ sort(p[16], p[17]);
+ sort(p[12], p[15]);
+ sort(p[14], p[17]);
+ sort(p[14], p[15]);
+ sort(p[13], p[16]);
+ sort(p[13], p[14]);
+ sort(p[15], p[16]);
+ sort(p[19], p[20]);
+ sort(p[18], p[19]);
+ sort(p[19], p[20]);
+ sort(p[21], p[22]);
+ sort(p[23], p[24]);
+ sort(p[21], p[23]);
+ sort(p[22], p[24]);
+ sort(p[22], p[23]);
+ sort(p[18], p[21]);
+ sort(p[20], p[23]);
+ sort(p[20], p[21]);
+ sort(p[19], p[22]);
+ sort(p[22], p[24]);
+ sort(p[19], p[20]);
+ sort(p[21], p[22]);
+ sort(p[23], p[24]);
+ sort(p[12], p[18]);
+ sort(p[16], p[22]);
+ sort(p[16], p[18]);
+ sort(p[14], p[20]);
+ sort(p[20], p[24]);
+ sort(p[14], p[16]);
+ sort(p[18], p[20]);
+ sort(p[22], p[24]);
+ sort(p[13], p[19]);
+ sort(p[17], p[23]);
+ sort(p[17], p[19]);
+ sort(p[15], p[21]);
+ sort(p[15], p[17]);
+ sort(p[19], p[21]);
+ sort(p[13], p[14]);
+ sort(p[15], p[16]);
+ sort(p[17], p[18]);
+ sort(p[19], p[20]);
+ sort(p[21], p[22]);
+ sort(p[23], p[24]);
+ sort(p[0], p[12]);
+ sort(p[8], p[20]);
+ sort(p[8], p[12]);
+ sort(p[4], p[16]);
+ sort(p[16], p[24]);
+ sort(p[12], p[16]);
+ sort(p[2], p[14]);
+ sort(p[10], p[22]);
+ sort(p[10], p[14]);
+ sort(p[6], p[18]);
+ sort(p[6], p[10]);
+ sort(p[10], p[12]);
+ sort(p[1], p[13]);
+ sort(p[9], p[21]);
+ sort(p[9], p[13]);
+ sort(p[5], p[17]);
+ sort(p[13], p[17]);
+ sort(p[3], p[15]);
+ sort(p[11], p[23]);
+ sort(p[11], p[15]);
+ sort(p[7], p[19]);
+ sort(p[7], p[11]);
+ sort(p[11], p[13]);
+ sort(p[11], p[12]);
+}
+} // namespace
+
+NENonLinearFilterKernel::NENonLinearFilterKernel()
+ : _border_width(0), _input(nullptr), _output(nullptr), _mask(nullptr), _pattern(MatrixPattern::BOX), _function(NonLinearFilterFunction::MIN), _func_idx(0), _border_size()
+{
+}
+
+BorderSize NENonLinearFilterKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NENonLinearFilterKernel::configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(3 != mask_size && 5 != mask_size);
+ ARM_COMPUTE_ERROR_ON(MatrixPattern::OTHER == pattern && nullptr == mask);
+
+ // Set class variables
+ _border_size = BorderSize(mask_size / 2);
+ _input = input;
+ _output = output;
+ _mask = mask;
+ _pattern = pattern;
+ _function = function;
+
+ // Configure kernel window
+ const unsigned int num_elems_processed_per_iteration = (MatrixPattern::OTHER == pattern) ? 1 : 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, mask_size),
+ output_access);
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+
+ // Define function index
+ _func_idx = (3 == mask_size) ? 0 : 1;
+
+ if(MatrixPattern::OTHER != pattern)
+ {
+ _func_idx = _func_idx * 3 + static_cast<unsigned int>(function);
+ }
+}
+
+void NENonLinearFilterKernel::fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern)
+{
+ unsigned int v = 0;
+
+ for(int r = 0; r < rows; ++r)
+ {
+ for(int c = 0; c < cols; ++c, ++v)
+ {
+ uint8_t val = 0;
+
+ switch(pattern)
+ {
+ case MatrixPattern::BOX:
+ val = 255;
+ break;
+ case MatrixPattern::CROSS:
+ val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
+ break;
+ case MatrixPattern::DISK:
+ val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) +
+ ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) * (cols / 2.0f))) <= 1.0f ? 255 : 0;
+ break;
+ default:
+ return;
+ }
+
+ mask[v] = val;
+ }
+ }
+}
+
+template <>
+void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ uint8x8_t p0 = vget_low_u8(top_data);
+ uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
+ uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
+ uint8x8_t p3 = vget_low_u8(mid_data);
+ uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p6 = vget_low_u8(bot_data);
+ uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
+ uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
+
+ sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+
+ vst1_u8(output.ptr(), p4);
+ },
+ input, output);
+}
+template <>
+void NENonLinearFilterKernel::median_filter_box<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
+ const uint8x8_t d[] =
+ {
+ vget_low_u8(top2_data),
+ vget_high_u8(top2_data),
+ vget_low_u8(top_data),
+ vget_high_u8(top_data),
+ vget_low_u8(mid_data),
+ vget_high_u8(mid_data),
+ vget_low_u8(bot_data),
+ vget_high_u8(bot_data),
+ vget_low_u8(bot2_data),
+ vget_high_u8(bot2_data)
+ };
+
+ uint8x8_t p[25];
+ for(unsigned int i = 0; i < 5; ++i)
+ {
+ const unsigned int idx_d = i * 2;
+ const unsigned int idx_p = i * 5;
+
+ p[idx_p] = d[idx_d];
+ p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
+ p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
+ p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
+ p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
+ }
+
+ sort25(p);
+
+ vst1_u8(output.ptr(), p[12]);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::min_filter_box(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ // Set row pointers
+ std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ // Get min of rows
+ uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
+
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
+ rows_min = vminq_u8(rows_min, data);
+ }
+
+ const uint8x8_t out = min_row<mask_w>(rows_min);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::max_filter_box(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ // Set row pointers
+ std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
+
+ // Get max of rows
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
+ rows_max = vmaxq_u8(rows_max, data);
+ }
+
+ // Get max of columns
+ const uint8x8_t out = max_row<mask_w>(rows_max);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
+
+ uint8x8_t p0 = top_data;
+ uint8x8_t p1 = vget_low_u8(mid_data);
+ uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p4 = bot_data;
+
+ sort5(p0, p1, p2, p3, p4);
+
+ vst1_u8(output.ptr(), p2);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::median_filter_cross<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.offset());
+ const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
+ const uint8x8_t bot2_data = vld1_u8(input_bot2_ptr + input.offset());
+
+ uint8x8_t p0 = top2_data;
+ uint8x8_t p1 = top_data;
+ uint8x8_t p2 = vget_low_u8(mid_data);
+ uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
+ uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
+ uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3);
+ uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4);
+ uint8x8_t p7 = bot_data;
+ uint8x8_t p8 = bot2_data;
+
+ sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
+
+ vst1_u8(output.ptr(), p4);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::min_filter_cross(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
+
+ // Set row pointers
+ std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
+
+ // Get min of rows
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
+ rows_min = vmin_u8(rows_min, data);
+ }
+
+ // Get min of middle row
+ const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
+ uint8x8_t out = min_row<mask_w>(data);
+
+ // Get final min
+ out = vmin_u8(out, rows_min);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::max_filter_cross(const Window &win)
+{
+ static_assert(mask_w > 0, "Mask size must not be 0");
+ static_assert(mask_h > 0, "Mask size must not be 0");
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+
+ const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
+
+ // Set row pointers
+    std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
+
+ // Get max of rows
+ for(unsigned int r = 1; r < mask_h; ++r)
+ {
+ const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
+ rows_max = vmax_u8(rows_max, data);
+ }
+
+ // Get max of middle row
+ const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
+ uint8x8_t out = max_row<mask_w>(data);
+
+ // Get final max
+ out = vmax_u8(out, rows_max);
+
+ // Store result as U8
+ vst1_u8(output.ptr(), out);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
+ uint8x8_t d[] =
+ {
+ vget_low_u8(top2_data),
+ vget_high_u8(top2_data),
+ vget_low_u8(top_data),
+ vget_high_u8(top_data),
+ vget_low_u8(mid_data),
+ vget_high_u8(mid_data),
+ vget_low_u8(bot_data),
+ vget_high_u8(bot_data),
+ vget_low_u8(bot2_data),
+ vget_high_u8(bot2_data)
+ };
+
+ uint8x8_t p[21];
+ p[0] = d[0];
+ p[1] = vext_u8(d[0], d[1], 1);
+ p[2] = vext_u8(d[0], d[1], 2);
+ p[18] = d[8];
+ p[19] = vext_u8(d[8], d[9], 1);
+ p[20] = vext_u8(d[8], d[9], 2);
+
+ for(unsigned int i = 0; i < 3; ++i)
+ {
+ const unsigned int idx_d = 2 + i * 2;
+ const unsigned int idx_p = 3 + i * 5;
+
+ p[idx_p] = d[idx_d];
+ p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
+ p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
+ p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
+ p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
+ }
+
+ sort21(p);
+
+ vst1_u8(output.ptr(), p[10]);
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::min_filter_disk<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
+ const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
+ uint8x16_t rows_min_5 = vminq_u8(top_data, bot_data);
+ rows_min_5 = vminq_u8(rows_min_5, mid_data);
+
+ const uint8x8_t out_3 = min_row<3>(rows_min_3);
+ const uint8x8_t out_5 = min_row<5>(rows_min_5);
+
+ vst1_u8(output.ptr(), vmin_u8(out_3, out_5));
+ },
+ input, output);
+}
+
+template <>
+void NENonLinearFilterKernel::max_filter_disk<5, 5>(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+
+ const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
+ const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+ const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+ const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+ const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+ const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+
+ const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
+ uint8x16_t rows_max_5 = vmaxq_u8(top_data, bot_data);
+ rows_max_5 = vmaxq_u8(rows_max_5, mid_data);
+
+ const uint8x8_t out_3 = max_row<3>(rows_max_3);
+ const uint8x8_t out_5 = max_row<5>(rows_max_5);
+
+ vst1_u8(output.ptr(), vmax_u8(out_3, out_5));
+ },
+ input, output);
+}
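
The three disk specialisations above exploit the row structure of the 5x5 disk mask: the two outer rows are 3 pixels wide and are loaded at an x offset of -1, while the three inner rows are 5 pixels wide and are loaded at -2, so each group can be reduced separately before the two partial results are combined:

    . X X X .
    X X X X X
    X X X X X     row widths: 3, 5, 5, 5, 3 (21 samples in total)
    X X X X X
    . X X X .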
+
+template <int mask_w, int mask_h>
+void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
+{
+ Iterator input(_input, win);
+ Iterator output(_output, win);
+ ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
+
+ const int k_row_half = mask_h / 2;
+ const int k_col_half = mask_w / 2;
+ constexpr int mask_size = mask_w * mask_h;
+
+ // Set row pointers
+    std::array<const unsigned char *, mask_h> input_ptrs{ {} };
+ for(int i = -k_row_half; i <= k_row_half; ++i)
+ {
+ input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
+ }
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ std::array<uint8_t, mask_size> vals{ {} };
+
+ size_t v = 0;
+ size_t m = 0;
+
+ for(unsigned int r = 0; r < mask_h; ++r)
+ {
+ const auto in_ptr = static_cast<const uint8_t *>(input_ptrs[r] + input.offset());
+
+ for(unsigned int c = 0; c < mask_w; ++c, ++m)
+ {
+ if(_mask[m] == 255)
+ {
+ vals[v] = in_ptr[c];
+ ++v;
+ }
+ }
+ }
+
+ // Only do something if there is at least one non-zero element in the
+ // mask
+ if(v > 0)
+ {
+ std::sort(vals.begin(), vals.begin() + v);
+
+ switch(_function)
+ {
+ case NonLinearFilterFunction::MIN:
+ *output.ptr() = vals[0];
+ break;
+ case NonLinearFilterFunction::MAX:
+ *output.ptr() = vals[v - 1];
+ break;
+ case NonLinearFilterFunction::MEDIAN:
+ *output.ptr() = vals[v / 2];
+ break;
+ default:
+ break;
+ }
+ }
+ },
+ input, output);
+}
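
Stripped of the window machinery, the generic path boils down to the following scalar logic, in which only mask positions set to 255 contribute and a single sort serves MIN, MAX and MEDIAN alike (src, mask, stride, x and y are hypothetical parameters):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    inline uint8_t generic_filter_scalar(const uint8_t *src, const uint8_t *mask, int mask_w, int mask_h,
                                         int stride, int x, int y, NonLinearFilterFunction function)
    {
        std::vector<uint8_t> vals;
        for(int r = 0; r < mask_h; ++r)
            for(int c = 0; c < mask_w; ++c)
                if(mask[r * mask_w + c] == 255) // 255 selects a pixel, anything else ignores it
                    vals.push_back(src[(y + r - mask_h / 2) * stride + x + c - mask_w / 2]);
        std::sort(vals.begin(), vals.end());
        return function == NonLinearFilterFunction::MIN ? vals.front()
             : function == NonLinearFilterFunction::MAX ? vals.back()
                                                        : vals[vals.size() / 2];
    }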
+
+void NENonLinearFilterKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ using NonLinearFilterFunction = void (NENonLinearFilterKernel::*)(const Window & window);
+
+ // Function table for BOX pattern
+ static const std::array<NonLinearFilterFunction, 6> func_table_box =
+ {
+ {
+ &NENonLinearFilterKernel::median_filter_box<3, 3>,
+ &NENonLinearFilterKernel::min_filter_box<3, 3>,
+ &NENonLinearFilterKernel::max_filter_box<3, 3>,
+ &NENonLinearFilterKernel::median_filter_box<5, 5>,
+ &NENonLinearFilterKernel::min_filter_box<5, 5>,
+ &NENonLinearFilterKernel::max_filter_box<5, 5>,
+ }
+ };
+
+ // Function table for CROSS pattern
+ static const std::array<NonLinearFilterFunction, 6> func_table_cross =
+ {
+ {
+ &NENonLinearFilterKernel::median_filter_cross<3, 3>,
+ &NENonLinearFilterKernel::min_filter_cross<3, 3>,
+ &NENonLinearFilterKernel::max_filter_cross<3, 3>,
+ &NENonLinearFilterKernel::median_filter_cross<5, 5>,
+ &NENonLinearFilterKernel::min_filter_cross<5, 5>,
+ &NENonLinearFilterKernel::max_filter_cross<5, 5>,
+ }
+ };
+
+ // Function table for DISK pattern
+ static const std::array<NonLinearFilterFunction, 6> func_table_disk =
+ {
+ {
+ &NENonLinearFilterKernel::median_filter_box<3, 3>,
+ &NENonLinearFilterKernel::min_filter_box<3, 3>,
+ &NENonLinearFilterKernel::max_filter_box<3, 3>,
+ &NENonLinearFilterKernel::median_filter_disk<5, 5>,
+ &NENonLinearFilterKernel::min_filter_disk<5, 5>,
+ &NENonLinearFilterKernel::max_filter_disk<5, 5>,
+ }
+ };
+
+ // Function table for OTHER pattern
+ static const std::array<NonLinearFilterFunction, 2> func_table_generic =
+ {
+ {
+ &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>,
+ &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>,
+ }
+ };
+
+ switch(_pattern)
+ {
+ case MatrixPattern::BOX:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_box.size());
+ (this->*func_table_box[_func_idx])(window);
+ break;
+ case MatrixPattern::CROSS:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_cross.size());
+ (this->*func_table_cross[_func_idx])(window);
+ break;
+ case MatrixPattern::DISK:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_disk.size());
+ (this->*func_table_disk[_func_idx])(window);
+ break;
+ case MatrixPattern::OTHER:
+ default:
+ ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_generic.size());
+ (this->*func_table_generic[_func_idx])(window);
+ break;
+ }
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
new file mode 100644
index 0000000000..1826c474f7
--- /dev/null
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+namespace fp16
+{
+inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
+{
+    // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
+ mask = vandq_u16(mask, vcgeq_f16(vc, in0));
+ mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1)));
+ mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2)));
+}
+
+inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
+{
+ // vc >= nc.val[0], vc > nc.val[2]
+ mask = vandq_u16(mask, vcgeq_f16(vc, in0));
+ mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
+}
+
+inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
+{
+ // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
+ mask = vandq_u16(mask, vcgtq_f16(vc, in0));
+ mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1)));
+ mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
+}
+
+inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
+{
+ auto in = static_cast<const float *__restrict>(in_ptr) - 1;
+ const auto out = static_cast<float *__restrict>(out_ptr);
+
+ // Get centre scores
+ const float16x8x2_t vc =
+ {
+ vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))),
+ vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13)))
+ };
+
+ // Neighboring pixels
+ in -= in_stride;
+
+ static const float16x4_t zero_f16x4 = vdup_n_f16(0);
+ static const uint16x8_t zero_u16 = vdupq_n_u16(0);
+ static const uint16x8_t true_mask = vceqq_u16(zero_u16, zero_u16);
+ static const uint16x8x2_t true_mask_x2 =
+ {
+ true_mask,
+ true_mask
+ };
+
+ uint16x8x2_t mask = true_mask_x2;
+
+ // Top row
+ const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
+ const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
+ const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
+
+ // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
+ mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]);
+ mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]);
+
+ in += in_stride;
+
+ // Middle row
+ const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
+ const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
+ const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
+
+ // vc >= nc.val[0], vc > nc.val[2]
+ mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]);
+ mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]);
+
+ in += in_stride;
+
+ // Bottom row
+ const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
+ const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
+ const float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);
+
+ // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
+ mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]);
+ mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]);
+
+ // Store
+ static const float16x8_t zero_f16x8 = vdupq_n_f16(0);
+
+ const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8);
+ vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0)));
+ vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0)));
+
+ const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8);
+ vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1)));
+ vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1)));
+}
+
+inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
+{
+ auto in = static_cast<const uint8_t *__restrict>(in_ptr) - 1;
+ const auto out = static_cast<uint8_t *__restrict>(out_ptr);
+
+ // Get centre scores
+ const uint8x16_t vc = vld1q_u8(in + 1);
+
+ // Neighboring pixels
+ in -= in_stride;
+
+ // Top row
+ const uint8x16_t l_nc_0 = vld1q_u8(in);
+ const uint8x16_t m_nc_0 = vld1q_u8(in + 1);
+ const uint8x16_t r_nc_0 = vld1q_u8(in + 2);
+
+ // Keep center scores if ...
+ // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0
+ uint8x16_t mask = vcgeq_u8(vc, l_nc_0);
+ mask = vandq_u8(mask, vcgeq_u8(vc, m_nc_0));
+ mask = vandq_u8(mask, vcgeq_u8(vc, r_nc_0));
+
+ in += in_stride;
+
+ // Middle row
+ const uint8x16_t l_nc_1 = vld1q_u8(in);
+ const uint8x16_t r_nc_1 = vld1q_u8(in + 2);
+
+ // ... and ...
+ // vc >= l_nc_1, vc > r_nc_1
+ mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1));
+
+ in += in_stride;
+
+ // Bottom row
+ const uint8x16_t l_nc_2 = vld1q_u8(in);
+ const uint8x16_t m_nc_2 = vld1q_u8(in + 1);
+ const uint8x16_t r_nc_2 = vld1q_u8(in + 2);
+
+ // ... and ...
+ // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2
+ mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2));
+ mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2));
+
+ // Store
+ static const uint8x16_t zero = vdupq_n_u8(0);
+ vst1q_u8(out, vbslq_u8(mask, vc, zero));
+}
+} // namespace fp16
+
+void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::U8:
+ _func = &fp16::non_maxima_suppression3x3_U8_U8;
+ break;
+ default:
+ _func = &fp16::non_maxima_suppression3x3_F32_F32;
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+namespace
+{
+inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
+{
+ auto input = static_cast<const float *__restrict>(input_ptr) - 1;
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ // Get centre scores
+ const float32x4x4_t vc =
+ {
+ {
+ vld1q_f32(input + 1),
+ vld1q_f32(input + 5),
+ vld1q_f32(input + 9),
+ vld1q_f32(input + 13)
+ }
+ };
+
+ // Neighboring pixels
+ float32x4x4_t l_nc{ {} };
+ float32x4x4_t m_nc{ {} };
+ float32x4x4_t r_nc{ {} };
+
+ input -= input_stride;
+
+ // Row0 - Low part
+ float32x4_t tmp_low = vld1q_f32(input);
+ float32x4_t tmp_high = vld1q_f32(input + 4);
+ float32x4_t tmp_high1 = vld1q_f32(input + 8);
+
+ l_nc.val[0] = tmp_low;
+ m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[1] = tmp_low;
+ m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
+
+ // Row0 - High part
+ tmp_low = tmp_high1;
+ tmp_high = vld1q_f32(input + 12);
+ tmp_high1 = vld1q_f32(input + 16);
+
+ l_nc.val[2] = tmp_low;
+ m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[3] = tmp_low;
+ m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
+
+    // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
+ uint32x4x4_t mask{ {} };
+ mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]);
+ mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0]));
+ mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]);
+ mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1]));
+ mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]);
+ mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2]));
+ mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]);
+ mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3]));
+
+ input += input_stride;
+
+ // Row1 - Low part
+ tmp_low = vld1q_f32(input);
+ tmp_high = vld1q_f32(input + 4);
+ tmp_high1 = vld1q_f32(input + 8);
+
+ l_nc.val[0] = tmp_low;
+ r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[1] = tmp_low;
+ r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
+
+ // Row1 - High part
+ tmp_low = tmp_high1;
+ tmp_high = vld1q_f32(input + 12);
+ tmp_high1 = vld1q_f32(input + 16);
+
+ l_nc.val[2] = tmp_low;
+ r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[3] = tmp_low;
+ r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
+
+    // vc >= nc.val[0], vc > nc.val[2]
+ mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
+
+ input += input_stride;
+
+ // Row2 - Low part
+ tmp_low = vld1q_f32(input);
+ tmp_high = vld1q_f32(input + 4);
+ tmp_high1 = vld1q_f32(input + 8);
+
+ l_nc.val[0] = tmp_low;
+ m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[1] = tmp_low;
+ m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);
+
+ // Row2 - High part
+ tmp_low = tmp_high1;
+ tmp_high = vld1q_f32(input + 12);
+ tmp_high1 = vld1q_f32(input + 16);
+
+ l_nc.val[2] = tmp_low;
+ m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);
+
+ tmp_low = tmp_high;
+ tmp_high = tmp_high1;
+
+ l_nc.val[3] = tmp_low;
+ m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
+ r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);
+
+    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0]));
+ mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1]));
+ mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2]));
+ mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3]));
+ mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));
+
+ static const float32x4_t zero = vdupq_n_f32(0.f);
+
+ // Store
+ vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero));
+ vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero));
+ vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero));
+ vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero));
+}
+
+inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
+{
+ auto input = static_cast<const uint8_t *__restrict>(input_ptr) - 1;
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+ // Get centre scores
+ const uint8x16_t vc = vld1q_u8(input + 1);
+
+ // Neighboring pixels
+ uint8x16_t l_nc{};
+ uint8x16_t m_nc{};
+ uint8x16_t r_nc{};
+
+ input -= input_stride;
+
+ // Row0
+ l_nc = vld1q_u8(input);
+ m_nc = vld1q_u8(input + 1);
+ r_nc = vld1q_u8(input + 2);
+
+    // vc >= l_nc, vc >= m_nc, vc >= r_nc
+ uint8x16_t mask = vcgeq_u8(vc, l_nc);
+ mask = vandq_u8(mask, vcgeq_u8(vc, m_nc));
+ mask = vandq_u8(mask, vcgeq_u8(vc, r_nc));
+
+ input += input_stride;
+
+ // Row1
+ l_nc = vld1q_u8(input);
+ r_nc = vld1q_u8(input + 2);
+
+    // vc >= l_nc, vc > r_nc
+ mask = vandq_u8(mask, vcgeq_u8(vc, l_nc));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
+
+ input += input_stride;
+
+ // Row2
+ l_nc = vld1q_u8(input);
+ m_nc = vld1q_u8(input + 1);
+ r_nc = vld1q_u8(input + 2);
+
+    // vc > l_nc, vc > m_nc, vc > r_nc
+ mask = vandq_u8(mask, vcgtq_u8(vc, l_nc));
+ mask = vandq_u8(mask, vcgtq_u8(vc, m_nc));
+ mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));
+
+ static const uint8x16_t zero = vdupq_n_u8(0);
+
+ // Store
+ vst1q_u8(output, vbslq_u8(mask, vc, zero));
+}
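
Both the FP16 and the generic U8/F32 variants resolve ties between equal neighbours with the same asymmetric scheme: non-strict comparisons (>=) against neighbours that precede the centre in raster order and strict ones (>) against those that follow it, so exactly one pixel of a plateau of equal values survives suppression. In scalar form, roughly:

    #include <cstdint>

    // p points at the centre pixel, stride is the row stride in elements
    inline bool is_local_max_3x3(const uint8_t *p, int stride)
    {
        const uint8_t c = *p;
        return c >= p[-stride - 1] && c >= p[-stride] && c >= p[-stride + 1] // top row: >=
               && c >= p[-1] && c > p[1]                                     // middle row: >= left, > right
               && c > p[stride - 1] && c > p[stride] && c > p[stride + 1];   // bottom row: >
    }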
+} // namespace
+
+NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize NENonMaximaSuppression3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _input = input;
+ _output = output;
+
+ if(input->info()->data_type() == DataType::U8)
+ {
+ _func = &non_maxima_suppression3x3_U8_U8;
+ }
+ else
+ {
+ _func = &non_maxima_suppression3x3_FLOAT_FLOAT;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NENonMaximaSuppression3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ _func(input.ptr(), output.ptr(), input_stride);
+ },
+ input, output);
+}
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..a971dc8d97
--- /dev/null
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NENormalizationLayerKernel::NENormalizationLayerKernel()
+ : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size()
+{
+}
+
+BorderSize NENormalizationLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
+ ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+ ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+
+ const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U);
+
+ _input = input;
+ _input_squared = input_squared;
+ _output = output;
+ _norm_info = norm_info;
+ _border_size = BorderSize(0, border_width);
+
+ const bool is_dt_f32 = _input->info()->data_type() == DataType::F32;
+
+ switch(norm_info.type())
+ {
+ case NormType::IN_MAP_1D:
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+ break;
+ case NormType::IN_MAP_2D:
+ // Normalize over X and Y
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+ break;
+ case NormType::CROSS_MAP:
+ _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+
+ const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16;
+ const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+ const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowRectangle input_access(input->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
+ AccessWindowRectangle input_squared_access(input_squared->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, input_squared_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+template <unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator input_squared(_input_squared, window);
+ Iterator output(_output, window);
+
+ const int dim_y = 1;
+ const int radius = _norm_info.norm_size() / 2;
+ const int total_size = _input->info()->dimension(dim) - 1;
+ const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+    // Padding is accounted for across X only; across Y the window rows are clamped to the valid range instead
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
+ const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff());
+ const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta());
+ const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa());
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ float32x4_t accu = vdupq_n_f32(0.f);
+ for(int j = first_row; j <= last_row; j++)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast<const float *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec);
+ const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), vinvq_f32(normalized));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+}
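
Per element this is the usual local response normalisation, out = in / (kappa + scale_coeff * sum)^beta, where sum accumulates the squared inputs over the window. A scalar model of the expression evaluated by the two vector operations above (assuming <cmath>):

    inline float normalize_scalar(float in, float sum_of_squares, float kappa, float coeff, float beta)
    {
        return in / std::pow(kappa + coeff * sum_of_squares, beta);
    }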
+
+template <unsigned int dim, bool do_2D_norm>
+void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
+{
+ Iterator input(_input, window);
+ Iterator input_squared(_input_squared, window);
+ Iterator output(_output, window);
+
+ const int dim_y = 1;
+ const int radius = _norm_info.norm_size() / 2;
+ const int total_size = _input->info()->dimension(dim) - 1;
+ const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
+    // Padding is accounted for across X only; across Y the window rows are clamped to the valid range instead
+ const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+ const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
+ const int min_top = 0;
+ const int max_bottom = _input->info()->dimension(dim_y) - 1;
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+ const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+ const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int current_slice = id[dim];
+ const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ const int first_slice = std::max(current_slice - radius, min_left);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ // Accumulate 2D In-Map values
+ qint8x16_t accu = vdupq_n_qs8(0);
+ for(int j = first_row; j <= last_row; ++j)
+ {
+ // Compute row displacement
+ const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+ const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+ for(int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+ }
+ }
+
+ // Normalize
+ const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+ const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+ const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+ vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+ },
+ input, input_squared, output);
+}
+
+void NENormalizationLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ // Run function
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000000..aa8c7a1847
--- /dev/null
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+
+#include <arm_neon.h>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+const float scale255_constant = 1.f / 255.f;
+const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
+const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
+
+/** Scales a given vector by 1/255.
+ *
+ * @note This does not work for all cases, e.g. for a float of 0.49999999999999994 or for very large float values.
+ *
+ * @param[in] in Input vector to scale.
+ * @return Scaled output rounded to nearest (round half up).
+ */
+inline int32x4_t scale255_S32_S32(int32x4_t in)
+{
+ // Scale
+ const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
+ // Round to nearest (round half up)
+ // Add +0.5 for all values
+ // Afterwards vcvt rounds toward zero
+ return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
+}
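
As a worked example, 128 * 128 = 16384 scales to 16384 / 255 = 64.25...; adding 0.5 gives 64.75..., which the truncating vcvt turns into 64, i.e. round half up. For the non-negative products handled here the scalar model is simply:

    #include <cstdint>

    inline int32_t scale255_scalar(int32_t in)
    {
        return static_cast<int32_t>(in * (1.f / 255.f) + 0.5f); // truncation after +0.5 == round half up
    }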
+
+inline uint16x8_t scale255_U16_U16(uint16x8_t in)
+{
+ const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
+ const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
+ return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+
+ const uint8x16_t ta1 = vld1q_u8(input1);
+ const uint8x16_t ta2 = vld1q_u8(input2);
+
+ uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
+ const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
+ uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
+ const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
+
+ tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
+
+ if(is_scale255)
+ {
+ tmp1_high = scale255_U16_U16(tmp1_high);
+ tmp1_low = scale255_U16_U16(tmp1_low);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if(is_sat)
+ {
+ tmp1_high = vqshlq_u16(tmp1_high, vn);
+ tmp1_low = vqshlq_u16(tmp1_low, vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_u16(tmp1_high, vn);
+ tmp1_low = vshlq_u16(tmp1_low, vn);
+ }
+ }
+
+ if(is_sat)
+ {
+ vst1q_u8(output, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
+ }
+ else
+ {
+ vst1q_u8(output, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
+ }
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
+{
+ // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0.
+ ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication");
+ ARM_COMPUTE_UNUSED(n);
+
+ const auto input1 = static_cast<const qint8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const qint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<qint8_t *__restrict>(output_ptr);
+
+ const qint8x16_t ta1 = vld1q_qs8(input1);
+ const qint8x16_t ta2 = vld1q_qs8(input2);
+
+ qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position);
+
+ vst1q_s8(output, res);
+}
+
+template <bool is_scale255, bool is_sat>
+inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
+{
+ int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1));
+ const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
+ int32x4_t tmp1_low = vmovl_s16(vget_low_s16(input1));
+ const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(input2));
+
+ tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
+ tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
+
+ if(is_scale255)
+ {
+ tmp1_high = scale255_S32_S32(tmp1_high);
+ tmp1_low = scale255_S32_S32(tmp1_low);
+ }
+ else
+ {
+ // Right shift amount
+ const int32x4_t vn = vdupq_n_s32(-n);
+ // Left shift amount
+ const int32x4_t vnl = vdupq_n_s32(n);
+ // Calculate conversion bit
+ const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
+ const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
+ const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
+ const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
+ const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
+ const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
+ const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
+ const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
+ if(is_sat)
+ {
+ tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
+ }
+ else
+ {
+ tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
+ tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
+ }
+ }
+
+ if(is_sat)
+ {
+ return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
+ }
+ else
+ {
+ return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
+ }
+}
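
The convert_high/convert_low terms turn the arithmetic right shift, which rounds towards negative infinity, into the truncation towards zero required by RoundingPolicy::TO_ZERO: for negative values the correction (sign << n) - sign = 2^n - 1 is added before shifting. With n = 1, for instance, plain -7 >> 1 yields -4 while (-7 + 1) >> 1 yields -3, matching -7 / 2 truncated. A scalar model of the same trick:

    #include <cstdint>

    inline int32_t shift_towards_zero(int32_t v, int n)
    {
        const int32_t sign    = static_cast<uint32_t>(v) >> 31; // 1 for negative values, 0 otherwise
        const int32_t convert = (sign << n) - sign;             // 2^n - 1 when negative, 0 otherwise
        return (v + convert) >> n;
    }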
+
+template <bool is_scale255, bool is_sat>
+inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
+{
+ const int16x8x2_t result =
+ {
+ {
+ // First 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
+ // Second 8 elements
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
+ }
+ };
+
+ return result;
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_S16_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const int16_t *__restrict>(input2_ptr);
+ const auto output = static_cast<int16_t *__restrict>(output_ptr);
+
+ const int16x8x2_t ta1 = vld2q_s16(input1);
+ const int16x8x2_t ta2 = vld2q_s16(input2);
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst2q_s16(output, result);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
+{
+ const auto input1 = static_cast<const float *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const float *__restrict>(input2_ptr);
+ const auto output = static_cast<float *__restrict>(output_ptr);
+
+ const float32x4x4_t ta1 = vld4q_f32(input1);
+ const float32x4x4_t ta2 = vld4q_f32(input2);
+ const float32x4_t scale_vec = vdupq_n_f32(scale);
+ const float32x4x4_t result =
+ {
+ {
+ vmulq_f32(vmulq_f32(ta1.val[0], ta2.val[0]), scale_vec),
+ vmulq_f32(vmulq_f32(ta1.val[1], ta2.val[1]), scale_vec),
+ vmulq_f32(vmulq_f32(ta1.val[2], ta2.val[2]), scale_vec),
+ vmulq_f32(vmulq_f32(ta1.val[3], ta2.val[3]), scale_vec)
+ }
+ };
+ vst4q_f32(output, result);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<int16_t *__restrict>(output_ptr);
+
+ const uint8x16_t bv = vld1q_u8(input2);
+ const uint8x16_t av = vld1q_u8(input1);
+
+ uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
+ uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
+ tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
+ tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+
+ if(is_scale255)
+ {
+ tmp_low = scale255_U16_U16(tmp_low);
+ tmp_high = scale255_U16_U16(tmp_high);
+ }
+ else
+ {
+ const int16x8_t vn = vdupq_n_s16(-n);
+
+ if(is_sat)
+ {
+ tmp_low = vqshlq_u16(tmp_low, vn);
+ tmp_high = vqshlq_u16(tmp_high, vn);
+ }
+ else
+ {
+ tmp_low = vshlq_u16(tmp_low, vn);
+ tmp_high = vshlq_u16(tmp_high, vn);
+ }
+ }
+
+ if(is_sat)
+ {
+ static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+
+ tmp_low = vminq_u16(tmp_low, max);
+ tmp_high = vminq_u16(tmp_high, max);
+ }
+
+ vst1q_s16(output, vreinterpretq_s16_u16(tmp_low));
+ vst1q_s16(output + 8, vreinterpretq_s16_u16(tmp_high));
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_S16_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
+ const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
+ const auto output = static_cast<int16_t *__restrict>(output_ptr);
+
+ const int16x8x2_t ta1 = vld2q_s16(input1);
+ const uint8x8x2_t ta2u = vld2_u8(input2);
+ const int16x8x2_t ta2 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
+ vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
+ }
+ };
+
+ const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+ vst2q_s16(output, result);
+}
+
+template <bool is_scale255, bool is_sat>
+void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+{
+ // Simply swap the two input buffers
+ mul_S16_U8_S16_n<is_scale255, is_sat>(input2_ptr, input1_ptr, output_ptr, n);
+}
+} // namespace
+
+NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
+ : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+{
+}
+
+void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+    if(output->info()->data_type() == DataType::QS8 || input1->info()->data_type() == DataType::QS8 || input2->info()->data_type() == DataType::QS8)
+ {
+ // All data types must be QS8
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
+ }
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+ _scale = scale;
+ _scale_exponent = 0;
+ _func_int = nullptr;
+ _func_q_int = nullptr;
+ _func_float = nullptr;
+
+ bool is_scale_255 = false;
+ // Check and validate scaling factor
+ if(std::abs(scale - scale255_constant) < 0.00001f)
+ {
+ ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ is_scale_255 = true;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
+ ARM_COMPUTE_UNUSED(rounding_policy);
+
+ int exponent = 0;
+ const float normalized_mantissa = std::frexp(scale, &exponent);
+
+        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+        // frexp returns a mantissa of 0.5, so for scale = 1/2^n the exponent is e = 1 - n, i.e. in the range -14 <= e <= 1
+        // It is non-positive for every supported n >= 1
+ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+            // Store the positive exponent n of the 1/2^n factor
+            // We subtract 1 to compensate for the mantissa of 0.5 returned by frexp
+ _scale_exponent = std::abs(exponent - 1);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Scale value not supported (Should be 1/(2^n) or 1/255");
+ }
+ }
+
+ const DataType dt_input1 = input1->info()->data_type();
+ const DataType dt_input2 = input2->info()->data_type();
+ const DataType dt_output = output->info()->data_type();
+ const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
+
+ if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_U8_U8_n<true, true> : &mul_U8_U8_U8_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_U8_U8_n<false, true> : &mul_U8_U8_U8_n<false, false>;
+ }
+ }
+ else if(DataType::S16 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_S16_S16_S16_n<true, true> : &mul_S16_S16_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_S16_S16_S16_n<false, true> : &mul_S16_S16_S16_n<false, false>;
+ }
+ }
+ else if(DataType::S16 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_S16_U8_S16_n<true, true> : &mul_S16_U8_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_S16_U8_S16_n<false, true> : &mul_S16_U8_S16_n<false, false>;
+ }
+ }
+ else if(DataType::U8 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_S16_S16_n<true, true> : &mul_U8_S16_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_S16_S16_n<false, true> : &mul_U8_S16_S16_n<false, false>;
+ }
+ }
+ else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_int = is_sat ? &mul_U8_U8_S16_n<true, true> : &mul_U8_U8_S16_n<true, false>;
+ }
+ else
+ {
+ _func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
+ }
+ }
+ else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output)
+ {
+ if(is_scale_255)
+ {
+ _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<true, true> : &mul_QS8_QS8_QS8_n<true, false>;
+ }
+ else
+ {
+ _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
+ }
+ }
+ else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
+ {
+ _func_float = &mul_F32_F32_F32_n<false, false>;
+ _func_int = nullptr;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("You called with the wrong img formats");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ INEKernel::configure(win);
+}
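
For instance, scale = 1/8 gives frexp(0.125) = 0.5 * 2^-2, so exponent == -2 and _scale_exponent == |(-2) - 1| == 3, i.e. the integer kernels shift right by 3. A quick standalone check of the decomposition (assuming <cassert>, <cmath> and <cstdlib>):

    int exponent = 0;
    const float mantissa = std::frexp(0.125f, &exponent);
    assert(mantissa == 0.5f && exponent == -2);
    const int n = std::abs(exponent - 1); // n == 3, since scale == 1/2^3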
+
+void NEPixelWiseMultiplicationKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input1(_input1, window);
+ Iterator input2(_input2, window);
+ Iterator output(_output, window);
+
+ if(_func_int != nullptr)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
+ },
+ input1, input2, output);
+ }
+ else if(_func_q_int != nullptr)
+ {
+ int fixed_point_position = _input1->info()->fixed_point_position();
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
+ },
+ input1, input2, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ (*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
+ },
+ input1, input2, output);
+ }
+}
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
new file mode 100644
index 0000000000..30b67b64b9
--- /dev/null
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <limits>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace
+{
+inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
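+ // The scale is the reciprocal of the pooling window area, with the window
+ // clipped against the given upper bounds so that windows overhanging the
+ // right/bottom border divide by the smaller, valid area.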
+ int start_x = id.x() * stride_x - pad_x;
+ int start_y = id.y() * stride_y - pad_y;
+ int end_x = std::min(start_x + pool_size, upper_bound_w);
+ int end_y = std::min(start_y + pool_size, upper_bound_h);
+ return 1.f / ((end_y - start_y) * (end_x - start_x));
+}
+
+inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
+ int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
+{
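+ // scale_values_q8[val] is 1 / val in Q0.7 format (e.g. 0x40 = 1/2, 0x2A ~ 1/3);
+ // the shift below rescales it to the tensor's own fixed point position.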
+ static const std::array<qint8_t, 10> scale_values_q8 =
+ { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
+ const int start_x = id.x() * stride_x - pad_x;
+ const int start_y = id.y() * stride_y - pad_y;
+ const int end_x = std::min(start_x + pool_size, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ const int val = ((end_y - start_y) * (end_x - start_x));
+ return scale_values_q8[val] >> (7 - fixed_point_position);
+}
+} // namespace
+
+NEPoolingLayerKernel::NEPoolingLayerKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
+{
+}
+
+BorderSize NEPoolingLayerKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ PoolingType pool_type = pool_info.pool_type();
+ int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+ DimensionRoundingType pool_round = pad_stride_info.round();
+ std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+ ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2);
+
+ // Check output dimensions
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ pool_size, pool_stride_x, pool_stride_y,
+ pool_pad_x, pool_pad_y, pool_round);
+ ARM_COMPUTE_UNUSED(pooled_w);
+ ARM_COMPUTE_UNUSED(pooled_h);
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
+ unsigned int num_elems_read_per_iteration = 0;
+ unsigned int num_elems_processed_per_iteration = 0;
+ unsigned int num_elems_horizontal_window = 0;
+
+ // Select element size
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7;
+ num_elems_horizontal_window = 8;
+ break;
+ case DataType::F32:
+ num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vld1q_f32 (four lanes) for pooling 3
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
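+ // Distance the last pooling window reads past the right/bottom edge of the
+ // input; any positive overhang is added to the border size below.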
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ // Set instance variables
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _border_size = BorderSize(pool_pad_y, pool_pad_x);
+ _border_size.right = std::max(upper_bound_w, pool_pad_x);
+ _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+ // Select appropriate function
+ switch(pool_size)
+ {
+ case 2:
+ if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+ }
+ else if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+ }
+ break;
+ case 3:
+ if(input->info()->data_type() == DataType::QS8)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ }
+ else if(input->info()->data_type() == DataType::F32)
+ {
+ _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling size");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
+ update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ INEKernel::configure(win);
+}
+
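+// 2x2 pooling for QS8: each iteration loads 16 fixed-point values from two
+// consecutive rows and produces 8 outputs, using saturating fixed-point
+// arithmetic in the average case.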
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+ qint8x8_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+ // Perform pooling
+ const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
+ res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
+ }
+ else
+ {
+ const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
+ res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
+ }
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x2_t top_data = vld1_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ const float32x2_t bottom_data = vld1_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
+ res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+ }
+ else
+ {
+ const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+ res = vpmax_f32(max_data, max_data);
+ }
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int fixed_point_position = _input->info()->fixed_point_position();
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
+ const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
+ const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
+ qint8x8_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+ const qint8x8_t scale_vec = vdup_n_qs8(scale);
+
+ // Perform pooling for stride 2
+ const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
+ const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
+ const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
+ const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
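+ // final_sum holds one 3-wide horizontal sum per input column. For stride 1
+ // the first 8 lanes are the results; for stride 2 only every other lane is,
+ // so vtbl2 gathers lanes 0, 2, ..., 14 into a single d register.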
+ if(pool_stride_x == 2)
+ {
+ const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
+ static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ res = vtbl2_s8(table, lookup_val);
+ }
+ else
+ {
+ res = vget_low_s8(final_sum);
+ }
+ res = vqmul_qs8(res, scale_vec, fixed_point_position);
+ }
+ else
+ {
+ const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
+ const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
+ const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
+ const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
+
+ if(pool_stride_x == 2)
+ {
+ const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
+ static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ res = vtbl2_s8(table, lookup_val);
+ }
+ else
+ {
+ res = vget_low_s8(final_max);
+ }
+ }
+ vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
+ },
+ input, output);
+}
+
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
+ const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(input_top_ptr + input.offset()));
+ const float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(input_middle_ptr + input.offset()));
+ const float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(input_bottom_ptr + input.offset()));
+ float32x2_t res = {};
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Calculate scale
+ float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const float32x2_t scale_v = vdup_n_f32(scale);
+
+ // Perform pooling
+ const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
+ res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+ res = vmul_f32(vpadd_f32(res, res), scale_v);
+ }
+ else
+ {
+ const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
+ res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
+ res = vpmax_f32(res, res);
+ }
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(res, 0);
+ },
+ input, output);
+}
+
+void NEPoolingLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ unsigned int pool_stride_x = 0;
+ unsigned int pool_stride_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+ // Step the input window in x and y according to the pooling stride
+ Window window_input(window);
+ unsigned int window_x_inc = 0;
+ if(_input->info()->data_type() == DataType::QS8)
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ }
+ else
+ {
+ window_x_inc = pool_stride_x;
+ }
+ window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
+ window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+
+ // Run function
+ (this->*_func)(window_input, window);
+}
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
new file mode 100644
index 0000000000..c3c44a5f32
--- /dev/null
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride)
+{
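+ // Convert four (x, y) map coordinates into linear byte offsets x + y * stride,
+ // clamping the coordinates to [-1, width] and [-1, height] so out-of-range
+ // points resolve into the border around the input plane.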
+ static const float32x4_t lowerxy = vdupq_n_f32(-1.0f);
+
+ float32x4_t x = vld1q_f32(mapx_ptr);
+ float32x4_t y = vld1q_f32(mapy_ptr);
+
+ // Clamp x and y coordinates to [-1, width] and [-1, height] respectively
+ x = vmaxq_f32(lowerxy, vminq_f32(x, width));
+ y = vmaxq_f32(lowerxy, vminq_f32(y, height));
+
+ const int32x4_t x_s32 = vcvtq_s32_f32(x);
+ const int32x4_t y_s32 = vcvtq_s32_f32(y);
+
+ return vmlaq_s32(x_s32, y_s32, stride);
+}
+
+} // namespace
+
+NERemapKernel::NERemapKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
+{
+}
+
+void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+
+ _input = input;
+ _output = output;
+ _map_x = map_x;
+ _map_y = map_y;
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ _func = &NERemapKernel::remap_nearest;
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ _func = &NERemapKernel::remap_bilinear;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ break;
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, 1),
+ AccessWindowRectangle(map_x->info(), 0, 0, num_elems_processed_per_iteration, 1),
+ AccessWindowRectangle(map_y->info(), 0, 0, num_elems_processed_per_iteration, 1),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NERemapKernel::remap_nearest(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator mapx(_map_x, window);
+ Iterator mapy(_map_y, window);
+
+ const float32x4_t width = vdupq_n_f32(static_cast<float>(_input->info()->dimension(0)));
+ const float32x4_t height = vdupq_n_f32(static_cast<float>(_input->info()->dimension(1)));
+ const int32x4_t in_stride = vdupq_n_s32(static_cast<int32_t>(_input->info()->strides_in_bytes()[1]));
+
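+ // NEON has no gather load: the 16 remapped pixels per iteration are fetched
+ // one lane at a time through the precomputed offsets.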
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
+ const auto mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
+ const uint8_t *in_ptr = in.ptr();
+
+ const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride);
+ const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride);
+ const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride);
+ const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride);
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp0, 0);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp0, 1);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp0, 2);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp0, 3);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp0, 4);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp0, 5);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp0, 6);
+ tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp1, 0);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp1, 1);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp1, 2);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp1, 3);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp1, 4);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp1, 5);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp1, 6);
+ tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, out, mapx, mapy);
+}
+
+void NERemapKernel::remap_bilinear(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator mapx(_map_x, window);
+ Iterator mapy(_map_y, window);
+
+ const size_t width = _input->info()->dimension(0);
+ const size_t height = _input->info()->dimension(1);
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
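+ // Each output pixel is interpolated from the 2x2 neighbourhood around its
+ // (mapx, mapy) coordinate; pixel_bilinear_c1u8_clamp clamps coordinates to
+ // the valid input range.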
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto mapx_ptr = reinterpret_cast<float *>(mapx.ptr());
+ const auto mapy_ptr = reinterpret_cast<float *>(mapy.ptr());
+ const uint8_t *in_ptr = in.ptr();
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, out, mapx, mapy);
+}
+
+void NERemapKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
new file mode 100644
index 0000000000..fd2978de1c
--- /dev/null
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NEScaleKernel::NEScaleKernel()
+ : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize NEScaleKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+
+ if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ }
+
+ if(policy == InterpolationPolicy::BILINEAR)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
+ }
+
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0);
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+ _offsets = offsets;
+ _dx = dx;
+ _dy = dy;
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ _func = &NEScaleKernel::scale_nearest;
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dx, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dy, 1, DataType::F32);
+
+ _func = &NEScaleKernel::scale_bilinear;
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ _func = &NEScaleKernel::scale_area;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ const int border_offset = (border_undefined) ? 0 : border_size().left;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+ AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration);
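+ // dx/dy are only used by bilinear interpolation and may be nullptr; access
+ // windows built on a nullptr tensor info are ignored when updating padding.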
+ AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal dy_access(dy == nullptr ? nullptr : dy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ input_access,
+ offsets_access,
+ dx_access,
+ dy_access,
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEScaleKernel::scale_nearest(const Window &window)
+{
+ const size_t input_stride = _input->info()->strides_in_bytes()[1];
+
+ // Compute the ratio between source height and destination height
+ const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_off;
+ win_off.set(Window::DimX, window[Window::DimX]);
+ win_off.set(Window::DimY, window[Window::DimY]);
+
+ for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator offsets(_offsets, win_off);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::U8:
+ {
+ uint8x16_t tmp = vdupq_n_u8(0);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const uint8_t *const in_ptr = in.ptr();
+
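+ // Pixel-centre convention: truncating (y + 0.5) * hr selects the source row.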
+ const size_t in_yi = (id.y() + 0.5f) * hr;
+ const size_t offset_row = in_yi * input_stride;
+
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14);
+ tmp = vsetq_lane_u8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15);
+
+ vst1q_u8(out.ptr(), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
+ case DataType::S16:
+ {
+ int16x8x2_t tmp =
+ {
+ {
+ vdupq_n_s16(0),
+ vdupq_n_s16(0)
+ }
+ };
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+ const size_t in_yi = (id.y() + 0.5f) * hr;
+ const size_t offset_row = in_yi * input_stride;
+
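+ // Even offsets fill tmp.val[0] and odd offsets tmp.val[1]; vst2q_s16 below
+ // interleaves the two halves again, restoring the original lane order.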
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
+ tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);
+
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
+ tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);
+
+ vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+ },
+ in, offsets, out);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+}
+
+void NEScaleKernel::scale_bilinear(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+
+ // Compute the ratio between source height and destination height
+ const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_off;
+ win_off.set(Window::DimX, window.x());
+ win_off.set(Window::DimY, window.y());
+
+ for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
+ {
+ win_off.set(d, Window::Dimension(0, 0, 0));
+ }
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+ Iterator offsets(_offsets, win_off);
+ Iterator dx(_dx, win_off);
+ Iterator dy(_dy, win_off);
+
+ /* Input image stride */
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+ const auto dx_ptr = reinterpret_cast<const float *>(dx.ptr());
+ const auto dy_ptr = reinterpret_cast<const float *>(dy.ptr());
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
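+ // Pixel-centre mapping: subtracting 0.5 before flooring yields the top row
+ // of the 2x2 neighbourhood used for the interpolation.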
+ const size_t in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f);
+ const size_t offset_row = in_yi * in_stride;
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+ tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+ tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, offsets, dx, dy, out);
+}
+
+void NEScaleKernel::scale_area(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const auto wr = static_cast<float>(_input->info()->dimension(0)) / static_cast<float>(_output->info()->dimension(0));
+ const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
+ const auto w = _input->info()->dimension(0);
+ const auto h = _input->info()->dimension(1);
+ const size_t in_stride = _input->info()->strides_in_bytes()[1];
+
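+ // pixel_area_c1u8_clamp averages the source rectangle that maps onto each
+ // output pixel, clamping the rectangle to the input bounds.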
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+
+ uint8x8_t tmp0 = vdup_n_u8(0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
+ tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
+
+ uint8x8_t tmp1 = vdup_n_u8(0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
+ tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
+
+ vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+ },
+ in, out);
+}
+
+void NEScaleKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
new file mode 100644
index 0000000000..183df1efcb
--- /dev/null
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+const int16x8_t three = vdupq_n_s16(3);
+const int16x8_t minus_three = vdupq_n_s16(-3);
+const int16x8_t ten = vdupq_n_s16(10);
+const int16x8_t minus_ten = vdupq_n_s16(-10);
+
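+// Coefficients of the 3x3 Scharr operator:
+//   Gx = [ -3 0 +3 ; -10 0 +10 ; -3 0 +3 ]
+//   Gy = [ -3 -10 -3 ; 0 0 0 ; +3 +10 +3 ]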
+inline int16x8_t scharr_y(const int16x8x2_t &top, const int16x8x2_t &bottom)
+{
+ // Top left
+ int16x8_t out = vmulq_s16(top.val[0], minus_three);
+ // Top center
+ out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 1), minus_ten);
+ // Top right
+ out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), minus_three);
+
+ // Bottom left
+ out = vmlaq_s16(out, bottom.val[0], three);
+ // Bottom center
+ out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 1), ten);
+ // Bottom right
+ out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
+
+ return out;
+}
+
+inline int16x8_t scharr_x(const int16x8x2_t &top, const int16x8x2_t &middle, const int16x8x2_t &bottom)
+{
+ // Top left
+ int16x8_t out = vmulq_s16(top.val[0], minus_three);
+ // Top right
+ out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), three);
+
+ // Middle left
+ out = vmlaq_s16(out, middle.val[0], minus_ten);
+ // Middle right
+ out = vmlaq_s16(out, vextq_s16(middle.val[0], middle.val[1], 2), ten);
+
+ // Bottom left
+ out = vmlaq_s16(out, bottom.val[0], minus_three);
+ // Bottom right
+ out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
+
+ return out;
+}
+} // namespace
+
+NEScharr3x3Kernel::NEScharr3x3Kernel()
+ : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+void NEScharr3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_scharr_x = output_x != nullptr;
+ _run_scharr_y = output_y != nullptr;
+
+ if(_run_scharr_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_scharr_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+BorderSize NEScharr3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NEScharr3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
+
+ Iterator input(_input, window);
+ Iterator output_y;
+ Iterator output_x;
+
+ if(_run_scharr_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
+ if(_run_scharr_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
+ if(_run_scharr_x && _run_scharr_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_scharr_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
+ },
+ input, output_x);
+ }
+ else if(_run_scharr_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
+ },
+ input, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
new file mode 100644
index 0000000000..ab08a1cfeb
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NESobel3x3Kernel::NESobel3x3Kernel()
+ : _run_sobel_x(false), _run_sobel_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
+{
+}
+
+BorderSize NESobel3x3Kernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void NESobel3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
+ const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
+ const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, 1));
+
+ Iterator input(_input, window);
+ Iterator output_y;
+ Iterator output_x;
+
+ if(_run_sobel_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
+ if(_run_sobel_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
+ if(_run_sobel_y && _run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
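+            // Sobel Y kernel (the middle row is zero, so it is skipped):
+            //   [ -1 -2 -1 ]
+            //   [  0  0  0 ]
+            //   [  1  2  1 ]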
+ //SOBEL Y
+ //top left
+ int16x8_t out_y = vnegq_s16(top_s16.val[0]);
+ //top mid
+ out_y = vmlaq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo);
+ //top right
+ out_y = vsubq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //bot left
+ out_y = vaddq_s16(out_y, bot_s16.val[0]);
+ //bot mid
+ out_y = vmlaq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
+ //bot right
+ out_y = vaddq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
+
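+            // Sobel X kernel (the middle column is zero, so it is skipped):
+            //   [ -1  0  1 ]
+            //   [ -2  0  2 ]
+            //   [ -1  0  1 ]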
+ //SOBEL X
+ //top left
+ int16x8_t out_x = vnegq_s16(top_s16.val[0]);
+ //top right
+ out_x = vaddq_s16(out_x, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out_x = vmlaq_s16(out_x, mid_s16.val[0], minustwo);
+ //mid right
+ out_x = vmlaq_s16(out_x, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
+ //bot left
+ out_x = vsubq_s16(out_x, bot_s16.val[0]);
+ //bot right
+ out_x = vaddq_s16(out_x, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t mid_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //SOBEL X
+ //top left
+ int16x8_t out = vnegq_s16(top_s16.val[0]);
+ //top right
+ out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //mid left
+ out = vmlaq_s16(out, mid_s16.val[0], minustwo);
+ //mid right
+ out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
+ //bot left
+ out = vsubq_s16(out, bot_s16.val[0]);
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
+ },
+ input, output_x);
+ }
+ else if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
+ const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
+
+ const int16x8x2_t top_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
+ }
+ };
+ const int16x8x2_t bot_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ }
+ };
+
+ //SOBEL Y
+ //top left
+ int16x8_t out = vnegq_s16(top_s16.val[0]);
+ //top mid
+ out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo);
+ //top right
+ out = vsubq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
+ //bot left
+ out = vaddq_s16(out, bot_s16.val[0]);
+ //bot mid
+ out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
+ //bot right
+ out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
+ },
+ input, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
new file mode 100644
index 0000000000..488eee1176
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NESobel5x5HorKernel::NESobel5x5HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize NESobel5x5HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NESobel5x5HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel5x5HorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_in(window);
+ win_in.shift(Window::DimX, -2);
+
+ Iterator input(_input, win_in);
+ Iterator output_x;
+ Iterator output_y;
+
+ if(_run_sobel_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
+ if(_run_sobel_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
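+    // Horizontal pass of the separable 5x5 Sobel: the X gradient is convolved
+    // with the derivative taps [ -1 -2 0 2 1 ] and the Y gradient with the
+    // smoothing taps [ 1 4 6 4 1 ]; the vertical pass swaps the two roles.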
+ if(_run_sobel_y && _run_sobel_x)
+ {
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out_y = data_s16.val[0];
+ out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
+ out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
+ out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
+ out_y = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
+
+ int16x8_t out_x = vnegq_s16(data_s16.val[0]);
+ out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
+ out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
+ out_x = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_sobel_x)
+ {
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = vnegq_s16(data_s16.val[0]);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
+ out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
+ },
+ input, output_x);
+ }
+ else if(_run_sobel_y)
+ {
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ const int16x8x2_t data_s16 =
+ {
+ {
+ vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
+ vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
+ }
+ };
+
+ int16x8_t out = data_s16.val[0];
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
+ out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
+ out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
+
+ vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
+ },
+ input, output_y);
+ }
+}
+
+NESobel5x5VertKernel::NESobel5x5VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize NESobel5x5VertKernel::border_size() const
+{
+ return BorderSize(2, 0);
+}
+
+void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S16);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S16);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S16);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ const ITensor *const input = _run_sobel_x ? input_x : input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 5;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel5x5VertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input_x;
+ Iterator input_y;
+ Iterator output_x;
+ Iterator output_y;
+
+ const int16_t *input_x_low2_ptr = nullptr;
+ const int16_t *input_x_low_ptr = nullptr;
+ const int16_t *input_x_mid_ptr = nullptr;
+ const int16_t *input_x_top_ptr = nullptr;
+ const int16_t *input_x_top2_ptr = nullptr;
+
+ const int16_t *input_y_low2_ptr = nullptr;
+ const int16_t *input_y_low_ptr = nullptr;
+ const int16_t *input_y_top_ptr = nullptr;
+ const int16_t *input_y_top2_ptr = nullptr;
+
+ if(_run_sobel_x)
+ {
+ input_x = Iterator(_input_x, window);
+ output_x = Iterator(_output_x, window);
+ input_x_top2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -2)));
+ input_x_top_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -1)));
+ input_x_mid_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 0)));
+ input_x_low_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 1)));
+ input_x_low2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 2)));
+ }
+
+ if(_run_sobel_y)
+ {
+ input_y = Iterator(_input_y, window);
+ output_y = Iterator(_output_y, window);
+ input_y_top2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -2)));
+ input_y_top_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -1)));
+ input_y_low_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 1)));
+ input_y_low2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 2)));
+ }
+
+ static const int16x8_t six = vdupq_n_s16(6);
+ static const int16x8_t four = vdupq_n_s16(4);
+ static const int16x8_t two = vdupq_n_s16(2);
+ static const int16x8_t minustwo = vdupq_n_s16(-2);
+
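+    // Vertical pass of the separable 5x5 Sobel: the X gradient is smoothed with
+    // [ 1 4 6 4 1 ] and the Y gradient is differentiated with [ -1 -2 0 2 1 ].
+    // Each iteration covers 16 S16 elements, processed as two int16x8 halves.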
+ if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+            // Convert the byte offset into an int16_t element offset
+ const size_t input_offset_high_s16 = input_x.offset() / 2;
+ const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
+
+ //HIGH DATA
+ //top2
+ int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16);
+ int16x8_t out_high = data_high;
+ //top
+ data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, four);
+ //mid
+ data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, six);
+ //low
+ data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, four);
+ //low2
+ data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16);
+ out_high = vaddq_s16(out_high, data_high);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())), out_high);
+
+ //LOW DATA
+ //top2
+ int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16);
+ int16x8_t out_low = data_low;
+ //top
+ data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, four);
+ //mid
+ data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, six);
+ //low
+ data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, four);
+ //low2
+ data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16);
+ out_low = vaddq_s16(out_low, data_low);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())) + 8, out_low);
+ },
+ input_x, output_x);
+ }
+
+ if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+            // Convert the byte offset into an int16_t element offset
+ const size_t input_offset_high_s16 = input_y.offset() / 2;
+ const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
+
+ //HIGH DATA
+ //top2
+ int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16);
+ int16x8_t out_high = vnegq_s16(data_high);
+ //top
+ data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, minustwo);
+ //low
+ data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16);
+ out_high = vmlaq_s16(out_high, data_high, two);
+ //low2
+ data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16);
+ out_high = vaddq_s16(out_high, data_high);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())), out_high);
+
+ //LOW DATA
+ //top2
+ int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16);
+ int16x8_t out_low = vnegq_s16(data_low);
+ //top
+ data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, minustwo);
+ //low
+ data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16);
+ out_low = vmlaq_s16(out_low, data_low, two);
+ //low2
+ data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16);
+ out_low = vaddq_s16(out_low, data_low);
+
+ vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())) + 8, out_low);
+ },
+ input_y, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
new file mode 100644
index 0000000000..9761942c69
--- /dev/null
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+const int32x4_t minusfour = vdupq_n_s32(-4);
+const int32x4_t minusfive = vdupq_n_s32(-5);
+const int32x4_t four = vdupq_n_s32(4);
+const int32x4_t five = vdupq_n_s32(5);
+const int32x4_t six = vdupq_n_s32(6);
+const int32x4_t fifteen = vdupq_n_s32(15);
+const int32x4_t twenty = vdupq_n_s32(20);
+
+inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
+{
+ int32x4x2_t out =
+ {
+ {
+ vnegq_s32(data.val[0]),
+ vnegq_s32(data.val[1])
+ }
+ };
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 1), minusfour);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 2), minusfive);
+
+ out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 1), four);
+
+ out.val[0] = vaddq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 2));
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 1), minusfour);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 2), minusfive);
+
+ out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 1), four);
+
+ out.val[1] = vaddq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 2));
+
+ return out;
+}
+
+inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
+{
+ int32x4x2_t out =
+ {
+ {
+ data.val[0],
+ data.val[1]
+ }
+ };
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 1), six);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 2), fifteen);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[0], data.val[1], 3), twenty);
+
+ out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
+
+ out.val[0] = vmlaq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 1), six);
+
+ out.val[0] = vaddq_s32(out.val[0],
+ vextq_s32(data.val[1], data.val[2], 2));
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 1), six);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 2), fifteen);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[1], data.val[2], 3), twenty);
+
+ out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
+
+ out.val[1] = vmlaq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 1), six);
+
+ out.val[1] = vaddq_s32(out.val[1],
+ vextq_s32(data.val[2], data.val[3], 2));
+
+ return out;
+}
+} // namespace
+
+NESobel7x7HorKernel::NESobel7x7HorKernel()
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
+{
+}
+
+BorderSize NESobel7x7HorKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = output_x != nullptr;
+ _run_sobel_y = output_y != nullptr;
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
+ }
+
+ _input = input;
+ _output_x = output_x;
+ _output_y = output_y;
+ _border_size = BorderSize(border_undefined ? 0 : 3, 3);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel7x7HorKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input(_input, window);
+ Iterator output_x;
+ Iterator output_y;
+
+ if(_run_sobel_x)
+ {
+ output_x = Iterator(_output_x, window);
+ }
+
+ if(_run_sobel_y)
+ {
+ output_y = Iterator(_output_y, window);
+ }
+
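+    // Each iteration reads 16 pixels starting three columns to the left of the
+    // current position, widens them to int32 and applies the 7-tap row filters
+    // defined above.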
+ if(_run_sobel_y && _run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr() - 3);
+
+ const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
+ const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
+
+ const int32x4x4_t data_s32 =
+ {
+ {
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
+ }
+ };
+
+ const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out_y.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out_y.val[1]);
+
+ const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out_x.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out_x.val[1]);
+ },
+ input, output_x, output_y);
+ }
+ else if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr() - 3);
+
+ const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
+ const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
+
+ const int32x4x4_t data_s32 =
+ {
+ {
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
+ }
+ };
+
+ const int32x4x2_t out = compute_hor_sobel_x(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
+ },
+ input, output_x);
+ }
+ else if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr() - 3);
+
+ const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
+ const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
+
+ const int32x4x4_t data_s32 =
+ {
+ {
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
+ vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
+ }
+ };
+
+            const int32x4x2_t out = compute_hor_sobel_y(data_s32);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
+ },
+ input, output_y);
+ }
+}
+
+NESobel7x7VertKernel::NESobel7x7VertKernel()
+ : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
+{
+}
+
+BorderSize NESobel7x7VertKernel::border_size() const
+{
+ return BorderSize(3, 0);
+}
+
+void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
+{
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _run_sobel_x = (output_x != nullptr);
+ _run_sobel_y = (output_y != nullptr);
+
+ if(_run_sobel_x)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
+ }
+
+ if(_run_sobel_y)
+ {
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
+ }
+
+ _input_x = input_x;
+ _input_y = input_y;
+ _output_x = output_x;
+ _output_y = output_y;
+
+ const ITensor *const input = _run_sobel_x ? input_x : input_y;
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_rows_read_per_iteration = 7;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
+ AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
+ output_x_access,
+ output_y_access);
+
+ output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+ output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ INEKernel::configure(win);
+}
+
+void NESobel7x7VertKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Iterator input_x;
+ Iterator input_y;
+ Iterator output_x;
+ Iterator output_y;
+
+ int32_t in_x_stride = 0;
+ int32_t in_y_stride = 0;
+
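+    // Row strides are converted from bytes to int32 element counts so that the
+    // pointer arithmetic in the loops below can step a whole row at a time.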
+ if(_run_sobel_x)
+ {
+ input_x = Iterator(_input_x, window);
+ output_x = Iterator(_output_x, window);
+ in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
+ }
+
+ if(_run_sobel_y)
+ {
+ input_y = Iterator(_input_y, window);
+ output_y = Iterator(_output_y, window);
+ in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
+ }
+
+ if(_run_sobel_x)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
+
+ //top3
+ int32x4x2_t data =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4)
+ }
+ };
+
+ int32x4x2_t out = data;
+
+ //top2
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
+
+ //top
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
+
+ //mid
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], twenty);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], twenty);
+
+ //low
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
+
+ //low2
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
+
+ //low3
+ in_ptr += in_x_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vaddq_s32(out.val[0], data.val[0]);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vaddq_s32(out.val[1], data.val[1]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
+ },
+ input_x, output_x);
+ }
+
+ if(_run_sobel_y)
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
+
+ //top3
+ int32x4x2_t data =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4)
+ }
+ };
+
+ int32x4x2_t out =
+ {
+ {
+ vnegq_s32(data.val[0]),
+ vnegq_s32(data.val[1])
+ }
+ };
+
+ //top2
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfour);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfour);
+
+ //top
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfive);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfive);
+
+ //low
+ in_ptr += (2 * in_y_stride);
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], five);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], five);
+
+ //low2
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vmlaq_s32(out.val[0], data.val[0], four);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vmlaq_s32(out.val[1], data.val[1], four);
+
+ //low3
+ in_ptr += in_y_stride;
+ data.val[0] = vld1q_s32(in_ptr);
+ out.val[0] = vaddq_s32(out.val[0], data.val[0]);
+
+ data.val[1] = vld1q_s32(in_ptr + 4);
+ out.val[1] = vaddq_s32(out.val[1], data.val[1]);
+
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
+ vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
+ },
+ input_y, output_y);
+ }
+}
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
new file mode 100644
index 0000000000..942662e84b
--- /dev/null
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <arm_neon.h>
+#include <cfloat>
+
+using namespace arm_compute;
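+
+// Softmax is computed in three phases for numerical stability:
+//   softmax(x_i) = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j)
+// NELogits1DMaxKernel finds the per-row maximum, NELogits1DShiftExpSumKernel
+// shifts, exponentiates and accumulates the row sum, and NELogits1DNormKernel
+// divides every element by that sum.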
+
+namespace
+{
+void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+ float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const float32x4_t current_value = vld1q_f32(in_ptr);
+ vec_max = vmaxq_f32(vec_max, current_value);
+ },
+ input);
+
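+        // Pairwise-reduce the four lanes of vec_max to a single scalar maximum.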
+ float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
+ carry_max = vpmax_f32(carry_max, carry_max);
+
+ *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window in_slice = window.first_slice_window_1D();
+
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window max_slice = window_max.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator output(out, max_slice);
+
+        qint8x16_t vec_max = vdupq_n_s8(-128); // start from the lowest representable value
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const qint8x16_t current_value = vld1q_qs8(in_ptr);
+ vec_max = vmaxq_qs8(vec_max, current_value);
+ },
+ input);
+
+ qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+ carry_max = vpmax_qs8(carry_max, carry_max);
+
+ *(reinterpret_cast<int8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} // namespace
+
+NELogits1DMaxKernel::NELogits1DMaxKernel()
+ : _func(nullptr), _border_size()
+{
+}
+
+BorderSize NELogits1DMaxKernel::border_size() const
+{
+ return _border_size;
+}
+
+void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ const int input_width = input->info()->valid_region().shape.x();
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_max_qs8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ num_elems_processed_per_iteration = 4;
+ _func = &logits_1d_max_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ _input = input;
+ _output = output;
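+    // The ragged tail of each row (input_width % step) is covered by a
+    // right-hand border so the vectorised loop never reads past valid data.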
+ _border_size = BorderSize(0, input_width % num_elems_processed_per_iteration, 0, 0);
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_written_per_row = 1;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_row, 1.f / input_width);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NELogits1DMaxKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, window);
+}
+
+namespace
+{
+void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 4;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
+
+ // Init sum to zero
+ float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
+ const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ float32x4_t vec_elements = vld1q_f32(in_ptr);
+ vec_elements = vsubq_f32(vec_elements, vec_max);
+ vec_elements = vexpq_f32(vec_elements);
+
+ vst1q_f32(exp_ptr, vec_elements);
+ vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+
+ // Reduce sum
+ float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
+ carry_addition = vpadd_f32(carry_addition, carry_addition);
+ float sum = vget_lane_f32(carry_addition, 0);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ float element = std::exp(in_ptr[i] - *max_ptr);
+ exp_ptr[i] = element;
+ sum += element;
+ }
+
+ *(reinterpret_cast<float *>(_sum.ptr())) = sum;
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+
+void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+{
+ Window window_max(window);
+ window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+ Window max_slice = window_max.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ constexpr int step = 8;
+ const int long_steps = in->info()->valid_region().shape.x() / step;
+ const int small_steps = in->info()->valid_region().shape.x() % step;
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator exp(out, in_slice);
+ Iterator _max(max, max_slice);
+ Iterator _sum(sum, max_slice);
+
+ // Get pointers
+ auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ auto exp_ptr = reinterpret_cast<qint8_t *>(exp.ptr());
+
+ // Init sum to zero
+ qint16x8_t vec_sum_value = vdupq_n_qs16(0);
+
+ // Get max value
+ const auto max_ptr = reinterpret_cast<const qint8_t *>(_max.ptr());
+ const qint8x8_t vec_max = vdup_n_qs8(*max_ptr);
+
+ // Run neon loop
+ for(int i = 0; i < long_steps; ++i)
+ {
+ qint8x8_t vec_elements = vld1_qs8(in_ptr);
+ vec_elements = vqsub_qs8(vec_elements, vec_max);
+ vec_elements = vqexp_qs8(vec_elements, fixed_point_position);
+
+ vst1_qs8(exp_ptr, vec_elements);
+ vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements));
+
+ in_ptr += step;
+ exp_ptr += step;
+ }
+ // Reduce sum
+ const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value));
+ const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1));
+ const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3));
+ qint16_t sum = sqadd_qs16(sum0, sum1);
+
+ // Run remaining elements
+ for(int i = 0; i < small_steps; ++i)
+ {
+ qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position);
+ exp_ptr[i] = element;
+ sum = sqadd_qs16(sum, element);
+ }
+
+ *(reinterpret_cast<qint8_t *>(_sum.ptr())) = sqmovn_qs16(sum);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+}
+} // namespace
+
+NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
+ : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, max, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+
+ unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x();
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_shift_exp_sum_qs8;
+ break;
+ case DataType::F32:
+ _func = &logits_1d_shift_exp_sum_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ _input = input;
+ _max = max;
+ _output = output;
+ _sum = sum;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal max_access(max->info(), 0, 1);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+ sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NELogits1DShiftExpSumKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _max, _output, _sum, window);
+}
+
+namespace
+{
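+// Final phase: multiply every exponentiated element by the reciprocal of its
+// row sum so that each row of the output sums to one.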
+void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
+ const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<float *>(output.ptr());
+
+ const float32x4_t vec_in = vld1q_f32(in_ptr);
+ const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
+
+ vst1q_f32(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+
+void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
+{
+ Window window_sum(window);
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window sum_slice = window_sum.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_1D();
+
+ const int fixed_point_position = in->info()->fixed_point_position();
+
+ do
+ {
+ Iterator input(in, in_slice);
+ Iterator _sum(sum, sum_slice);
+ Iterator output(out, in_slice);
+
+ const int8_t sum_value = *reinterpret_cast<const qint8_t *>(_sum.ptr());
+ const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position);
+
+ execute_window_loop(in_slice, [&](const Coordinates & id)
+ {
+ const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
+ const auto out_ptr = reinterpret_cast<qint8_t *>(output.ptr());
+
+ const qint8x16_t vec_in = vld1q_qs8(in_ptr);
+ const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position);
+
+ vst1q_qs8(out_ptr, normalized_value);
+ },
+ input, output);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+}
+} // namespace
+
+NELogits1DNormKernel::NELogits1DNormKernel()
+ : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr)
+{
+}
+
+void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, sum);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+ _input = input;
+ _sum = sum;
+ _output = output;
+
+ // Configure kernel window
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ _func = &logits_1d_norm_qs8;
+ num_elems_processed_per_iteration = 16;
+ break;
+ case DataType::F32:
+ num_elems_processed_per_iteration = 4;
+ _func = &logits_1d_norm_f32;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type.");
+ }
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowStatic sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, sum_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NELogits1DNormKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _sum, _output, window);
+}
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
new file mode 100644
index 0000000000..f0b58d82f6
--- /dev/null
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ILut.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+} // namespace arm_compute
+
+NETableLookupKernel::NETableLookupKernel()
+ : _func(nullptr), _lut(nullptr)
+{
+}
+
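+// Generic path for S16 tables: the LUT's index_offset re-bases possibly
+// negative inputs before the bounds check. The uint8_t specialisation below
+// indexes the 256-entry table directly and needs no range check.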
+template <class T>
+void NETableLookupKernel::tableLookup(const Window &window)
+{
+ uint32_t offset = _lut->index_offset();
+ size_t count = _lut->num_elements();
+ const auto lut = reinterpret_cast<const T *>(_lut->buffer());
+    unsigned int step = num_elems_processed_per_iteration;
+
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+
+ Iterator input = Iterator(_input, window);
+ Iterator output = Iterator(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ for(unsigned int i = 0; i < step; ++i, ++input_ptr, ++output_ptr)
+ {
+ const int32_t index = offset + *input_ptr;
+
+ if(0 <= index && index < static_cast<int32_t>(count))
+ {
+ *output_ptr = lut[index];
+ }
+ }
+ },
+ input, output);
+}
+
+namespace arm_compute
+{
+template <>
+void NETableLookupKernel::tableLookup<uint8_t>(const Window &window)
+{
+ const uint8_t *const lut = _lut->buffer();
+    unsigned int step = num_elems_processed_per_iteration;
+
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+
+ Iterator input = Iterator(_input, window);
+ Iterator output = Iterator(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8_t *input_ptr = input.ptr();
+ uint8_t *output_ptr = output.ptr();
+
+ for(unsigned int i = 0; i < step; ++i)
+ {
+ *output_ptr++ = lut[*input_ptr++];
+ }
+ },
+ input, output);
+}
+} // namespace arm_compute
+
+void NETableLookupKernel::configure(const ITensor *input, const ILut *lut, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(lut == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ _lut = lut;
+
+ if(input->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::U8)
+ {
+ _func = &NETableLookupKernel::tableLookup<uint8_t>;
+ }
+ else if(input->info()->data_type() == DataType::S16 && output->info()->data_type() == DataType::S16)
+ {
+ _func = &NETableLookupKernel::tableLookup<int16_t>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of input and output DataType.");
+ }
+
+    INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
+}
+
+void NETableLookupKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
new file mode 100644
index 0000000000..72031195d9
--- /dev/null
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+NEThresholdKernel::NEThresholdKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _threshold(0), _false_value(0), _true_value(0), _upper(0)
+{
+}
+
+void NEThresholdKernel::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+ _threshold = threshold;
+ _false_value = false_value;
+ _true_value = true_value;
+ _upper = upper;
+
+ switch(type)
+ {
+ case ThresholdType::BINARY:
+ _func = &NEThresholdKernel::run_binary;
+ break;
+ case ThresholdType::RANGE:
+ _func = &NEThresholdKernel::run_range;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Thresholding type not recognized");
+ break;
+ }
+
+ const unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+inline void NEThresholdKernel::run_binary(const Window &window)
+{
+ const uint8x16_t threshold = vdupq_n_u8(_threshold);
+ const uint8x16_t true_value = vdupq_n_u8(_true_value);
+ const uint8x16_t false_value = vdupq_n_u8(_false_value);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+ const uint8x16_t mask = vcgtq_u8(data, threshold);
+
+ vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value));
+ },
+ input, output);
+}
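+
+// Scalar equivalent of the vector select above (sketch for reference): for each
+// byte d, out = (d > _threshold) ? _true_value : _false_value. vcgtq_u8 yields a
+// per-lane mask of 0xFF/0x00 and vbslq_u8 picks bits from true_value where the
+// mask is set, from false_value elsewhere.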
+
+inline void NEThresholdKernel::run_range(const Window &window)
+{
+ const uint8x16_t lower_threshold = vdupq_n_u8(_threshold);
+ const uint8x16_t upper_threshold = vdupq_n_u8(_upper);
+ const uint8x16_t true_value = vdupq_n_u8(_true_value);
+ const uint8x16_t false_value = vdupq_n_u8(_false_value);
+
+ Iterator input(_input, window);
+ Iterator output(_output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x16_t data = vld1q_u8(input.ptr());
+
+ uint8x16_t mask = vcleq_u8(data, upper_threshold);
+
+ mask = vandq_u8(vcgeq_u8(data, lower_threshold), mask);
+
+ vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value));
+ },
+ input, output);
+}
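+
+// Scalar equivalent of the range threshold (sketch): for each byte d,
+// out = (_threshold <= d && d <= _upper) ? _true_value : _false_value,
+// built from the two unsigned comparisons combined with vandq_u8.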
+
+void NEThresholdKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
new file mode 100644
index 0000000000..492de8a6ee
--- /dev/null
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator input(in, window);
+ Iterator output(out, window_out);
+
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 0 * input_stride_in_bytes));
+ const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 1 * input_stride_in_bytes));
+ const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 2 * input_stride_in_bytes));
+ const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 3 * input_stride_in_bytes));
+ const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 4 * input_stride_in_bytes));
+ const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 5 * input_stride_in_bytes));
+ const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 6 * input_stride_in_bytes));
+ const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 7 * input_stride_in_bytes));
+
+ // Transpose 2x2
+ const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
+ const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
+ const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
+ const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
+
+ // Transpose 4x4
+ const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
+ const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
+ const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
+ const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
+
+ // Transpose 8x8
+ const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
+ const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
+ const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
+ },
+ input, output);
+}
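+
+// The 8x8 transpose above is the standard NEON trn cascade: vtrn_u8 exchanges
+// odd/even bytes (2x2 blocks), vtrn_u16 exchanges byte pairs (4x4 blocks) and
+// vtrn_u32 exchanges 4-byte halves (8x8), so after log2(8) = 3 stages every
+// byte sits at its transposed position. Sketch on one 2x2 corner:
+//   rows (a b ... / e f ...) -> vtrn_u8 -> (a e ... / b f ...)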
+
+void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator input(in, window);
+ Iterator output(out, window_out);
+
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes));
+ const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes));
+ const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes));
+ const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes));
+
+ // Transpose 2x2
+ const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
+ const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
+
+ // Transpose 4x4
+ const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
+ vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
+ },
+ input, output);
+}
+
+void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
+{
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator input(in, window);
+ Iterator output(out, window_out);
+
+ const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
+ const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes));
+ const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes));
+ const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes));
+ const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes));
+
+ // Transpose 2x2
+ const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
+ const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
+ const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
+ const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
+
+ // Compute destination address
+ const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+
+ // Swap block 01 with block 10 and store
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
+ vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
+ },
+ input, output);
+}
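+
+// With 4x4 32-bit elements a D register only holds two lanes, so the code
+// transposes the four 2x2 sub-blocks with vtrn_u32 and swaps the off-diagonal
+// blocks while recombining (the "Swap block 01 with block 10" step above),
+// using the identity [[A B],[C D]]^T = [[A^T C^T],[B^T D^T]].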
+} // namespace
+
+NETransposeKernel::NETransposeKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+void NETransposeKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t w_out = input->info()->dimension(1);
+ const size_t h_out = input->info()->dimension(0);
+ output_shape.set(0, w_out);
+ output_shape.set(1, h_out);
+
+    // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+
+ unsigned int num_elems_processed_per_iteration = 0;
+
+ switch(input->info()->element_size())
+ {
+ case 1:
+ _func = &transpose_8bit_elements;
+ num_elems_processed_per_iteration = 8;
+ break;
+ case 2:
+ _func = &transpose_16bit_elements;
+ num_elems_processed_per_iteration = 4;
+ break;
+ case 4:
+ _func = &transpose_32bit_elements;
+ num_elems_processed_per_iteration = 4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Element size not supported");
+ break;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration),
+ output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ INEKernel::configure(win);
+}
+
+void NETransposeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
new file mode 100644
index 0000000000..6c90a334af
--- /dev/null
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+namespace
+{
+inline uint8_t nearest_interpolation(const uint8_t *in_ptr, int x, int y, size_t stride)
+{
+ return in_ptr[x + y * stride];
+}
+} // namespace
+
+INEWarpKernel::INEWarpKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _constant_border_value(0), _matrix(nullptr)
+{
+}
+
+void INEWarpKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (this->*_func)(window);
+}
+
+void INEWarpKernel::configure(const ITensor *input, ITensor *output, const float *matrix, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == matrix);
+
+ _matrix = matrix;
+ _constant_border_value = constant_border_value;
+
+ switch(border_mode)
+ {
+ case BorderMode::UNDEFINED:
+ _func = &INEWarpKernel::warp_undefined;
+ break;
+ case BorderMode::CONSTANT:
+ _func = &INEWarpKernel::warp_constant;
+ break;
+ case BorderMode::REPLICATE:
+ _func = &INEWarpKernel::warp_replicate;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Border mode not supported");
+ break;
+ }
+
+ _input = input;
+ _output = output;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(1U));
+
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0], input_valid_region.anchor[1],
+ input_valid_region.anchor[0] + input_valid_region.shape[0],
+ input_valid_region.anchor[1] + input_valid_region.shape[1]);
+ AccessWindowHorizontal output_access(output->info(), 0, 1);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+    // x0 = M00 * x + M01 * y + M02
+    // y0 = M10 * x + M11 * y + M12
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M01 = _matrix[0 + 1 * 2];
+ const float M11 = _matrix[1 + 1 * 2];
+ const float M02 = _matrix[0 + 2 * 2];
+ const float M12 = _matrix[1 + 2 * 2];
+
+ // "M00 * x" and "M10 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+
+ // Affine warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ }
+
+ // Only write to output if x0 and y0 are within the valid region.
+ // Otherwise the read value would be undefined.
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+
+ x0 += M00;
+ y0 += M10;
+ },
+ in, out);
+}
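+
+// Worked example of the incremental update above (illustrative numbers): for
+// M = [1.5 0 10; 0 1.5 20] and row y = 4, const_x0 = 0 * 4 + 10 = 10 and
+// const_y0 = 1.5 * 4 + 20 = 26; advancing one pixel in x then only costs the
+// two additions x0 += M00 and y0 += M10 instead of a full matrix product.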
+
+template <InterpolationPolicy interpolation>
+void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+    // x0 = M00 * x + M01 * y + M02
+    // y0 = M10 * x + M11 * y + M12
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M01 = _matrix[0 + 1 * 2];
+ const float M11 = _matrix[1 + 1 * 2];
+ const float M02 = _matrix[0 + 2 * 2];
+ const float M12 = _matrix[1 + 2 * 2];
+
+ // "M00 * x" and "M10 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+
+ // Affine warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ }
+
+ // Only use input values if x0 and y0 are within the valid region.
+ // Otherwise write the constant border value.
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+ *out.ptr() = _constant_border_value;
+ }
+
+ x0 += M00;
+ y0 += M10;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // Current row
+ int y_cur = window.y().start();
+
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M01 = _matrix[0 + 1 * 2];
+ const float M11 = _matrix[1 + 1 * 2];
+ const float M02 = _matrix[0 + 2 * 2];
+ const float M12 = _matrix[1 + 2 * 2];
+
+ // "M00 * x" and "M10 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+
+ // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ }
+
+ // Only load from (x0, y0) if the point is within the valid region.
+ // Otherwise load from the edge of the valid region.
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+ // Clamp coordinates
+ const auto xi = clamp<int>(x0, min_x, max_x - 1);
+ const auto yi = clamp<int>(y0, min_y, max_y - 1);
+
+ *out.ptr() = *(in.ptr() + xi + yi * stride);
+ }
+
+ x0 += M00;
+ y0 += M10;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // x0 = M00 * x + M01 * y + M02
+ // y0 = M10 * x + M11 * y + M12
+ // z0 = M20 * x + M21 * y + M22
+ // xn = x0 / z0
+ // yn = y0 / z0
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M20 = _matrix[2];
+ const float M01 = _matrix[0 + 1 * 3];
+ const float M11 = _matrix[1 + 1 * 3];
+ const float M21 = _matrix[2 + 1 * 3];
+ const float M02 = _matrix[0 + 2 * 3];
+ const float M12 = _matrix[1 + 2 * 3];
+ const float M22 = _matrix[2 + 2 * 3];
+
+ // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+ const float start_z0 = M20 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+ float const_z0 = M21 * y_cur + M22;
+
+ // Perspective warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+ float z0 = start_z0 + const_z0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+ const_z0 = M21 * y_cur + M22;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ z0 = start_z0 + const_z0;
+ }
+
+ const float xn = x0 / z0;
+ const float yn = y0 / z0;
+
+ // Only write to output if xn and yn are within the valid region.
+ // Otherwise the read value would be undefined.
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+
+ x0 += M00;
+ y0 += M10;
+ z0 += M20;
+ },
+ in, out);
+}
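+
+// The perspective variants differ from the affine ones only by the homogeneous
+// divide: (x0, y0, z0) is still stepped incrementally (x0 += M00, y0 += M10,
+// z0 += M20 per pixel) and the sampled coordinate is (xn, yn) = (x0/z0, y0/z0).
+// With M20 = M21 = 0 and M22 = 1, z0 stays 1 and this reduces to the affine case.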
+
+template <InterpolationPolicy interpolation>
+void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // x0 = M00 * x + M01 * y + M02
+ // y0 = M10 * x + M11 * y + M12
+ // z0 = M20 * x + M21 * y + M22
+ // xn = x0 / z0
+ // yn = y0 / z0
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M20 = _matrix[2];
+ const float M01 = _matrix[0 + 1 * 3];
+ const float M11 = _matrix[1 + 1 * 3];
+ const float M21 = _matrix[2 + 1 * 3];
+ const float M02 = _matrix[0 + 2 * 3];
+ const float M12 = _matrix[1 + 2 * 3];
+ const float M22 = _matrix[2 + 2 * 3];
+
+ // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+ const float start_z0 = M20 * window.x().start();
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+ float const_z0 = M21 * y_cur + M22;
+
+ // Perspective warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+ float z0 = start_z0 + const_z0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+ const_z0 = M21 * y_cur + M22;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ z0 = start_z0 + const_z0;
+ }
+
+ const float xn = x0 / z0;
+ const float yn = y0 / z0;
+
+ // Only use input values if xn and yn are within the valid region.
+ // Otherwise write the constant border value.
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+ *out.ptr() = _constant_border_value;
+ }
+
+ x0 += M00;
+ y0 += M10;
+ z0 += M20;
+ },
+ in, out);
+}
+
+template <InterpolationPolicy interpolation>
+void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window)
+{
+ // Don't increment in X and Y direction for the input tensor
+ // A pointer to the start of this plane is needed as base for the precomputed offsets
+ Window win_in(window);
+ win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, win_in);
+ Iterator out(_output, window);
+
+ const int min_x = _input->info()->valid_region().anchor[0];
+ const int max_x = min_x + _input->info()->valid_region().shape[0];
+ const int min_y = _input->info()->valid_region().anchor[1];
+ const int max_y = min_y + _input->info()->valid_region().shape[1];
+ const size_t stride = _input->info()->strides_in_bytes()[1];
+
+ // Current row
+ int y_cur = window.y().start();
+
+ // x0 = M00 * x + M01 * y + M02
+ // y0 = M10 * x + M11 * y + M12
+ // z0 = M20 * x + M21 * y + M22
+ // xn = x0 / z0
+ // yn = y0 / z0
+ const float M00 = _matrix[0];
+ const float M10 = _matrix[1];
+ const float M20 = _matrix[2];
+ const float M01 = _matrix[0 + 1 * 3];
+ const float M11 = _matrix[1 + 1 * 3];
+ const float M21 = _matrix[2 + 1 * 3];
+ const float M02 = _matrix[0 + 2 * 3];
+ const float M12 = _matrix[1 + 2 * 3];
+ const float M22 = _matrix[2 + 2 * 3];
+
+ // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
+ const float start_x0 = M00 * window.x().start();
+ const float start_y0 = M10 * window.x().start();
+ const float start_z0 = M20 * window.x().start();
+
+ // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
+ float const_x0 = M01 * y_cur + M02;
+ float const_y0 = M11 * y_cur + M12;
+ float const_z0 = M21 * y_cur + M22;
+
+ // Perspective warp coordinates
+ float x0 = start_x0 + const_x0;
+ float y0 = start_y0 + const_y0;
+ float z0 = start_z0 + const_z0;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0
+ if(y_cur != id.y())
+ {
+ y_cur = id.y();
+
+ const_x0 = M01 * y_cur + M02;
+ const_y0 = M11 * y_cur + M12;
+ const_z0 = M21 * y_cur + M22;
+
+ x0 = start_x0 + const_x0;
+ y0 = start_y0 + const_y0;
+ z0 = start_z0 + const_z0;
+ }
+
+ const float xn = x0 / z0;
+ const float yn = y0 / z0;
+
+        // Only load from (xn, yn) if the point is within the valid region.
+        // Otherwise load from the edge of the valid region.
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
+ {
+ switch(interpolation)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Interpolation not supported");
+ }
+ }
+ else
+ {
+ // Clamp coordinates
+            const auto xi = clamp<int>(xn, min_x, max_x - 1);
+            const auto yi = clamp<int>(yn, min_y, max_y - 1);
+
+ *out.ptr() = *(in.ptr() + xi + yi * stride);
+ }
+
+ x0 += M00;
+ y0 += M10;
+ z0 += M20;
+ },
+ in, out);
+}
+
+template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
+template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::BILINEAR>;
+template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
+template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>;
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..aa6be44bee
--- /dev/null
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
+{
+ const unsigned int kernel_size = input->info()->dimension(0);
+ const unsigned int kernel_depth = input->info()->dimension(2);
+ const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
+ const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
+ const unsigned int input_stride_z = input->info()->strides_in_bytes().z();
+ const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
+
+ // Create iterators
+ Iterator in(input, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+ // Linearize volume
+ for(unsigned int d = 0; d < kernel_depth; ++d)
+ {
+ for(unsigned int j = 0; j < kernel_size; ++j)
+ {
+ for(unsigned int i = 0; i < kernel_size; ++i)
+ {
+ *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
+ }
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
+ }
+
+ // Add bias
+ if(bias != nullptr)
+ {
+ *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
+ }
+ },
+ in);
+}
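+
+// Layout sketch (illustrative): each [kernel_size x kernel_size x kernel_depth]
+// kernel is linearized x-fastest into one output column, with the optional bias
+// appended as a final element. E.g. a 3x3x2 kernel with bias produces a column
+// of 3 * 3 * 2 + 1 = 19 values, the shape expected by the GEMM-based
+// convolution path.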
+} // namespace
+
+NEWeightsReshapeKernel::NEWeightsReshapeKernel()
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
+
+ // Set data type and shape for output tensor if not yet configured
+ set_data_type_if_unknown(*output->info(), dt);
+ set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ if(bias != nullptr)
+ {
+ TensorShape bias_shape{ input->info()->tensor_shape()[3] };
+
+ // Set data type and shape for bias tensor if not yet configured
+ set_data_type_if_unknown(*bias->info(), dt);
+ set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
+ set_shape_if_empty(*bias->info(), bias_shape);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ }
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ _func = &weights_reshape<uint32_t>;
+ break;
+ }
+ case DataType::QS8:
+ {
+ _func = &weights_reshape<uint8_t>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR_ON("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel
+ Window window = calculate_max_window(*input->info(), Steps());
+ window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+ window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+ window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+
+    // NEWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(window);
+}
+
+void NEWeightsReshapeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ (*_func)(_input, _bias, _output, window);
+}
diff --git a/src/core/PyramidInfo.cpp b/src/core/PyramidInfo.cpp
new file mode 100644
index 0000000000..1c12eee46f
--- /dev/null
+++ b/src/core/PyramidInfo.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/PyramidInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+PyramidInfo::PyramidInfo()
+ : _num_levels(0), _tensor_shape(), _format(Format::UNKNOWN), _scale(0.0f)
+{
+}
+
+PyramidInfo::PyramidInfo(size_t num_levels, float scale, size_t width, size_t height, Format format)
+ : PyramidInfo()
+{
+ init(num_levels, scale, width, height, format);
+}
+
+PyramidInfo::PyramidInfo(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format)
+ : PyramidInfo()
+{
+ init(num_levels, scale, tensor_shape, format);
+}
+
+void PyramidInfo::init(size_t num_levels, float scale, size_t width, size_t height, Format format)
+{
+ init(num_levels, scale, TensorShape(width, height), format);
+}
+
+void PyramidInfo::init(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format)
+{
+ ARM_COMPUTE_ERROR_ON(0 == num_levels);
+ ARM_COMPUTE_ERROR_ON(0.0f == scale);
+ ARM_COMPUTE_ERROR_ON(0 == tensor_shape.x());
+ ARM_COMPUTE_ERROR_ON(0 == tensor_shape.y());
+ ARM_COMPUTE_ERROR_ON(Format::IYUV == format);
+ ARM_COMPUTE_ERROR_ON(Format::NV12 == format);
+ ARM_COMPUTE_ERROR_ON(Format::NV21 == format);
+ ARM_COMPUTE_ERROR_ON(Format::UYVY422 == format);
+ ARM_COMPUTE_ERROR_ON(Format::YUV444 == format);
+ ARM_COMPUTE_ERROR_ON(Format::YUYV422 == format);
+ ARM_COMPUTE_ERROR_ON_MSG(0 != _num_levels, "PyramidInfo already initialized");
+ ARM_COMPUTE_ERROR_ON(0 == (tensor_shape.x() * pow(scale, num_levels)));
+ ARM_COMPUTE_ERROR_ON(0 == (tensor_shape.y() * pow(scale, num_levels)));
+
+ _num_levels = num_levels;
+ _format = format;
+ _scale = scale;
+ _tensor_shape = tensor_shape;
+}
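+
+// Example (illustrative values): num_levels = 4 and scale = 0.5f on a 640x480
+// base describe levels of 640x480, 320x240, 160x120 and 80x60, i.e. level i has
+// dimensions scaled by scale^i; the checks above reject configurations whose
+// deepest level would collapse to zero.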
+
+size_t PyramidInfo::num_levels() const
+{
+ return _num_levels;
+}
+
+size_t PyramidInfo::width() const
+{
+ return _tensor_shape.x();
+}
+
+size_t PyramidInfo::height() const
+{
+ return _tensor_shape.y();
+}
+
+const TensorShape &PyramidInfo::tensor_shape() const
+{
+ return _tensor_shape;
+}
+
+Format PyramidInfo::format() const
+{
+ return _format;
+}
+
+float PyramidInfo::scale() const
+{
+ return _scale;
+}
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
new file mode 100644
index 0000000000..f5a282df8a
--- /dev/null
+++ b/src/core/SubTensorInfo.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/SubTensorInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+SubTensorInfo::SubTensorInfo()
+ : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape }
+{
+}
+
+SubTensorInfo::SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
+
+ // Initialize valid region
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void SubTensorInfo::set_tensor_shape(TensorShape shape)
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+ _tensor_shape = shape;
+}
+
+bool SubTensorInfo::extend_padding(const PaddingSize &padding)
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ ARM_COMPUTE_ERROR_ON(!_parent->is_resizable());
+
+ // Extend parent padding if required
+ return _parent->extend_padding(padding);
+}
+
+size_t SubTensorInfo::offset_element_in_bytes(const Coordinates &pos) const
+{
+ ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(pos, _tensor_shape.num_dimensions());
+
+ size_t offset = offset_first_element_in_bytes();
+ const Strides &strides = strides_in_bytes();
+
+ for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ {
+ offset += pos[i] * strides[i];
+ }
+
+ return offset;
+}
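+
+// Worked example (illustrative): with parent strides of {1, 100} bytes, the
+// element at pos = (3, 2) resolves to offset_first_element_in_bytes() + 3 * 1 +
+// 2 * 100. A sub-tensor reuses the parent's buffer and strides; only the anchor
+// offset differs.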
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
new file mode 100644
index 0000000000..3d07ccb69a
--- /dev/null
+++ b/src/core/TensorInfo.cpp
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorInfo.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+TensorInfo::TensorInfo()
+ : _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
+ _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+{
+}
+
+TensorInfo::TensorInfo(const ITensorInfo &info)
+ : TensorInfo()
+{
+ _total_size = info.total_size();
+ _fixed_point_position = info.fixed_point_position();
+ _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+ _strides_in_bytes = info.strides_in_bytes();
+ _num_channels = info.num_channels();
+ _tensor_shape = info.tensor_shape();
+ _data_type = info.data_type();
+ _format = info.format();
+ _is_resizable = info.is_resizable();
+ _valid_region = info.valid_region();
+ _padding = info.padding();
+}
+
+TensorInfo::TensorInfo(Format format)
+ : TensorInfo(TensorShape(), format)
+{
+}
+
+TensorInfo::TensorInfo(unsigned int width, unsigned int height, Format format)
+ : TensorInfo(TensorShape(width, height), format)
+{
+}
+
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format)
+ : TensorInfo()
+{
+ init(tensor_shape, format);
+}
+
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type, size_t fixed_point_position)
+ : TensorInfo()
+{
+ init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
+
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+ : TensorInfo()
+{
+ init(tensor_shape, num_channels, data_type, fixed_point_position);
+}
+
+TensorInfo::TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height)
+ : TensorInfo()
+{
+ init(hog_info, width, height);
+}
+
+void TensorInfo::init(Format format)
+{
+ init(TensorShape(), format);
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, Format format)
+{
+ size_t num_channels = num_channels_from_format(format);
+ const DataType type = data_type_from_format(format);
+
+ init(tensor_shape, num_channels, type);
+
+ _format = format;
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, Format format,
+ const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes)
+{
+ size_t num_channels = num_channels_from_format(format);
+ const DataType type = data_type_from_format(format);
+
+ init(tensor_shape, num_channels, type, strides_in_bytes, offset_first_element_in_bytes, total_size_in_bytes);
+
+ _format = format;
+}
+
+void TensorInfo::init(size_t num_channels, DataType data_type, size_t fixed_point_position)
+{
+ init(TensorShape(), num_channels, data_type, fixed_point_position);
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+
+ set_tensor_shape(tensor_shape);
+}
+
+void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
+ const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes, int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+ _tensor_shape = tensor_shape;
+ _offset_first_element_in_bytes = offset_first_element_in_bytes;
+ _strides_in_bytes = strides_in_bytes;
+ _total_size = total_size_in_bytes;
+
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
+
+void TensorInfo::init(const HOGInfo &hog_info, unsigned int width, unsigned int height)
+{
+ // Number of cells for each block
+ const Size2D num_cells_per_block = hog_info.num_cells_per_block();
+
+    // Tensor shape = (Number of horizontal blocks) x (Number of vertical blocks)
+ const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+
+ // Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
+ const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
+
+ init(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+}
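+
+// Worked example (illustrative numbers only): with the common HOG setup of
+// 2x2 cells per block and 9 bins per cell, each element of the resulting
+// (horizontal blocks) x (vertical blocks) tensor carries 2 * 2 * 9 = 36
+// F32 channels, i.e. one block descriptor per tensor element.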
+
+size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
+{
+ const size_t num_channels = num_channels_from_format(format);
+ const DataType type = data_type_from_format(format);
+ size_t total_size = init_auto_padding(tensor_shape, num_channels, type);
+
+ _format = format;
+
+ return total_size;
+}
+
+size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(num_channels == 0);
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+
+ _fixed_point_position = fixed_point_position;
+ _data_type = data_type;
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+ _tensor_shape = tensor_shape;
+
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+
+ auto_padding();
+
+ return _total_size;
+}
+
+size_t TensorInfo::init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height)
+{
+ // Number of cells for each block
+ const Size2D num_cells_per_block = hog_info.num_cells_per_block();
+
+    // Tensor shape = (Number of horizontal blocks) x (Number of vertical blocks)
+ const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+
+ // Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
+ const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
+
+ return init_auto_padding(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+}
+
+bool TensorInfo::auto_padding()
+{
+ ARM_COMPUTE_ERROR_ON(!_is_resizable);
+
+    // Some kernels compute 32 elements at a time; in the worst case they
+    // will read 32 values past the last element
+ const size_t extra_pad_x = _tensor_shape.num_dimensions() < 1 ? 0 : 32;
+ const size_t pad_x = _tensor_shape.num_dimensions() < 1 ? 0 : 4;
+ const size_t pad_y = _tensor_shape.num_dimensions() < 2 ? 0 : 4;
+
+ return extend_padding(PaddingSize(pad_y, pad_x + extra_pad_x, pad_y, pad_x));
+}
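+
+// Usage sketch: for any tensor with at least two dimensions this requests
+// 4 elements of padding on every side plus 32 extra on the right:
+//
+//   TensorInfo info(TensorShape(13U, 7U), Format::U8);
+//   info.auto_padding(); // padding becomes { top 4, right 36, bottom 4, left 4 }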
+
+std::tuple<Strides, size_t, size_t> TensorInfo::calculate_padding_requirements(const PaddingSize &padding)
+{
+ // Calculate resulting stride for the X, Y and Z dimension
+ const size_t stride_x = element_size();
+ const size_t stride_y = (padding.left + _tensor_shape[0] + padding.right) * stride_x;
+ const size_t stride_z = (padding.top + _tensor_shape[1] + padding.bottom) * stride_y;
+
+ Strides required_strides;
+ size_t required_total_size = 0;
+ const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y;
+
+ switch(_tensor_shape.num_dimensions())
+ {
+ case 0:
+ {
+ if(_tensor_shape.total_size() > 0)
+ {
+ required_strides = Strides(stride_x);
+ required_total_size = stride_z;
+ }
+ break;
+ }
+ case 1:
+ required_strides = compute_strides(*this, stride_x);
+ required_total_size = stride_z;
+ break;
+ case 2:
+ required_strides = compute_strides(*this, stride_x, stride_y);
+ required_total_size = stride_z;
+ break;
+ default:
+ {
+ required_strides = compute_strides(*this, stride_x, stride_y, stride_z);
+
+ const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
+
+ required_total_size = _tensor_shape[idx_last_dimension] * required_strides[idx_last_dimension];
+ break;
+ }
+ }
+
+ return std::make_tuple(required_strides, required_offset_first_element, required_total_size);
+}
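+
+// Worked example (continuing the U8 (13, 7) sketch above with padding
+// { 4, 36, 4, 4 }): stride_x = 1, stride_y = (4 + 13 + 36) * 1 = 53 and
+// stride_z = (4 + 7 + 4) * 53 = 795, so the strides become (1, 53), the
+// first element sits at 4 * 1 + 4 * 53 = 216 bytes and 795 bytes are
+// required in total.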
+
+bool TensorInfo::extend_padding(const PaddingSize &padding)
+{
+ ARM_COMPUTE_ERROR_ON(!_is_resizable);
+
+ bool updated = false;
+
+ if(padding.top > _padding.top)
+ {
+ _padding.top = padding.top;
+ updated = true;
+ }
+
+ if(padding.right > _padding.right)
+ {
+ _padding.right = padding.right;
+ updated = true;
+ }
+
+ if(padding.bottom > _padding.bottom)
+ {
+ _padding.bottom = padding.bottom;
+ updated = true;
+ }
+
+ if(padding.left > _padding.left)
+ {
+ _padding.left = padding.left;
+ updated = true;
+ }
+
+ std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+
+ return updated;
+}
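+
+// Note: padding only grows and each side is maximised independently, so
+// extending { top 4, right 36, bottom 4, left 4 } by PaddingSize(8) yields
+// { 8, 36, 8, 8 }; strides and total size are recomputed in either case.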
+
+void TensorInfo::set_data_type(DataType data_type)
+{
+ _data_type = data_type;
+ _format = Format::UNKNOWN;
+}
+
+void TensorInfo::set_num_channels(int num_channels)
+{
+ _num_channels = num_channels;
+ _format = Format::UNKNOWN;
+}
+
+void TensorInfo::set_format(Format format)
+{
+ _format = format;
+
+ if(_data_type == DataType::UNKNOWN)
+ {
+ _num_channels = num_channels_from_format(format);
+ _data_type = data_type_from_format(format);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(num_channels_from_format(format) != _num_channels);
+ ARM_COMPUTE_ERROR_ON(data_type_from_format(format) != _data_type);
+ }
+}
+
+void TensorInfo::set_tensor_shape(TensorShape shape)
+{
+ _tensor_shape = shape;
+ _offset_first_element_in_bytes = 0;
+ _strides_in_bytes = compute_strides(*this);
+
+ if(_tensor_shape.num_dimensions() == 0)
+ {
+ _total_size = _strides_in_bytes[0];
+ }
+ else
+ {
+ const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
+ _total_size = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension];
+ }
+
+ Coordinates coordinates;
+ coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
+ _valid_region = ValidRegion{ coordinates, _tensor_shape };
+}
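+
+// Example: for a U8 tensor, set_tensor_shape(TensorShape(13U, 7U)) produces
+// tightly packed strides (1, 13), a zero start offset and a total size of
+// 7 * 13 = 91 bytes.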
+
+void TensorInfo::set_fixed_point_position(int fixed_point_position)
+{
+ ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
+ ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
+ _fixed_point_position = fixed_point_position;
+}
+
+size_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
+{
+ ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(pos, _tensor_shape.num_dimensions());
+
+ size_t offset = _offset_first_element_in_bytes;
+
+ for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+ {
+ offset += pos[i] * _strides_in_bytes[i];
+ }
+
+ return offset;
+}
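+
+// Example (using the unpadded (13, 7) U8 tensor above): element (2, 3) is
+// found at 0 + 2 * 1 + 3 * 13 = 41 bytes from the start of the buffer.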
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
new file mode 100644
index 0000000000..bf005c12f6
--- /dev/null
+++ b/src/core/Utils.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Utils.h"
+
+#include "arm_compute/core/FixedPoint.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <fstream>
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+std::string arm_compute::build_information()
+{
+ static const std::string information =
+#include "arm_compute_version.embed"
+ ;
+ return information;
+}
+
+std::string arm_compute::read_file(const std::string &filename, bool binary)
+{
+ std::string out;
+ std::ifstream fs;
+
+ try
+ {
+ fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+ std::ios_base::openmode mode = std::ios::in;
+
+ if(binary)
+ {
+ mode |= std::ios::binary;
+ }
+
+ fs.open(filename, mode);
+
+ // Go to the end of the file
+ fs.seekg(0, std::ios::end);
+ // Reserve the memory required to store the file's content
+ out.reserve(fs.tellg());
+ // Go back to the beginning of the file
+ fs.seekg(0, std::ios::beg);
+ // Copy the content of the file
+ out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
+ }
+ catch(const std::ifstream::failure &e)
+ {
+ ARM_COMPUTE_ERROR("Accessing %s: %s", filename.c_str(), e.what());
+ }
+
+ return out;
+}
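+
+// Usage sketch ("program.cl" is a hypothetical path): read an OpenCL source
+// file as text; pass binary = true when loading pre-built program binaries:
+//
+//   const std::string source = arm_compute::read_file("program.cl", false);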
+
+const std::string &arm_compute::string_from_format(Format format)
+{
+ static std::map<Format, const std::string> formats_map =
+ {
+ { Format::UNKNOWN, "UNKNOWN" },
+ { Format::U8, "U8" },
+ { Format::S16, "S16" },
+ { Format::U16, "U16" },
+ { Format::S32, "S32" },
+ { Format::U32, "U32" },
+ { Format::F16, "F16" },
+ { Format::F32, "F32" },
+ { Format::UV88, "UV88" },
+ { Format::RGB888, "RGB888" },
+ { Format::RGBA8888, "RGBA8888" },
+ { Format::YUV444, "YUV444" },
+ { Format::YUYV422, "YUYV422" },
+ { Format::NV12, "NV12" },
+ { Format::NV21, "NV21" },
+ { Format::IYUV, "IYUV" },
+ { Format::UYVY422, "UYVY422" }
+ };
+
+ return formats_map[format];
+}
+
+const std::string &arm_compute::string_from_channel(Channel channel)
+{
+ static std::map<Channel, const std::string> channels_map =
+ {
+ { Channel::UNKNOWN, "UNKNOWN" },
+ { Channel::R, "R" },
+ { Channel::G, "G" },
+ { Channel::B, "B" },
+ { Channel::A, "A" },
+ { Channel::Y, "Y" },
+ { Channel::U, "U" },
+ { Channel::V, "V" },
+ { Channel::C0, "C0" },
+ { Channel::C1, "C1" },
+ { Channel::C2, "C2" },
+ { Channel::C3, "C3" }
+ };
+
+ return channels_map[channel];
+}
+
+const std::string &arm_compute::string_from_data_type(DataType dt)
+{
+ static std::map<DataType, const std::string> dt_map =
+ {
+ { DataType::UNKNOWN, "UNKNOWN" },
+ { DataType::S8, "S8" },
+ { DataType::U8, "U8" },
+ { DataType::QS8, "QS8" },
+ { DataType::S16, "S16" },
+ { DataType::U16, "U16" },
+ { DataType::QS16, "QS16" },
+ { DataType::S32, "S32" },
+ { DataType::U32, "U32" },
+ { DataType::S64, "S64" },
+ { DataType::U64, "U64" },
+ { DataType::F16, "F16" },
+ { DataType::F32, "F32" },
+ { DataType::F64, "F64" },
+ { DataType::SIZET, "SIZET" },
+ };
+
+ return dt_map[dt];
+}
+
+const std::string &arm_compute::string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
+{
+ static std::map<ActivationLayerInfo::ActivationFunction, const std::string> act_map =
+ {
+ { ActivationLayerInfo::ActivationFunction::ABS, "ABS" },
+ { ActivationLayerInfo::ActivationFunction::LINEAR, "LINEAR" },
+ { ActivationLayerInfo::ActivationFunction::LOGISTIC, "LOGISTIC" },
+ { ActivationLayerInfo::ActivationFunction::RELU, "RELU" },
+ { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, "BRELU" },
+ { ActivationLayerInfo::ActivationFunction::SOFT_RELU, "SRELU" },
+ { ActivationLayerInfo::ActivationFunction::SQRT, "SQRT" },
+ { ActivationLayerInfo::ActivationFunction::SQUARE, "SQUARE" },
+ { ActivationLayerInfo::ActivationFunction::TANH, "TANH" },
+ };
+
+ return act_map[act];
+}
+
+const std::string &arm_compute::string_from_matrix_pattern(MatrixPattern pattern)
+{
+ static std::map<MatrixPattern, const std::string> pattern_map =
+ {
+ { MatrixPattern::BOX, "BOX" },
+ { MatrixPattern::CROSS, "CROSS" },
+ { MatrixPattern::DISK, "DISK" },
+ { MatrixPattern::OTHER, "OTHER" },
+ };
+
+ return pattern_map[pattern];
+}
+
+const std::string &arm_compute::string_from_non_linear_filter_function(NonLinearFilterFunction function)
+{
+ static std::map<NonLinearFilterFunction, const std::string> func_map =
+ {
+ { NonLinearFilterFunction::MAX, "MAX" },
+ { NonLinearFilterFunction::MEDIAN, "MEDIAN" },
+ { NonLinearFilterFunction::MIN, "MIN" },
+ };
+
+ return func_map[function];
+}
+
+const std::string &arm_compute::string_from_interpolation_policy(InterpolationPolicy policy)
+{
+ static std::map<InterpolationPolicy, const std::string> interpolation_policy_map =
+ {
+ { InterpolationPolicy::AREA, "AREA" },
+ { InterpolationPolicy::BILINEAR, "BILINEAR" },
+ { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" },
+ };
+
+ return interpolation_policy_map[policy];
+}
+
+const std::string &arm_compute::string_from_border_mode(BorderMode border_mode)
+{
+ static std::map<BorderMode, const std::string> border_mode_map =
+ {
+ { BorderMode::UNDEFINED, "UNDEFINED" },
+ { BorderMode::CONSTANT, "CONSTANT" },
+ { BorderMode::REPLICATE, "REPLICATE" },
+ };
+
+ return border_mode_map[border_mode];
+}
+
+const std::string &arm_compute::string_from_norm_type(NormType type)
+{
+ static std::map<NormType, const std::string> norm_type_map =
+ {
+ { NormType::IN_MAP_1D, "IN_MAP_1D" },
+ { NormType::IN_MAP_2D, "IN_MAP_2D" },
+ { NormType::CROSS_MAP, "CROSS_MAP" },
+ };
+
+ return norm_type_map[type];
+}
+
+std::string arm_compute::lower_string(const std::string &val)
+{
+ std::string res = val;
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+ return res;
+}
+
+const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height, unsigned int kernel_size,
+ unsigned int stride_x, unsigned int stride_y,
+ unsigned int pad_x, unsigned int pad_y,
+ DimensionRoundingType round_type)
+{
+ unsigned int w = 0;
+ unsigned int h = 0;
+ switch(round_type)
+ {
+ case DimensionRoundingType::FLOOR:
+ w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ break;
+ case DimensionRoundingType::CEIL:
+ w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_size) / stride_x) + 1));
+ h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_size) / stride_y) + 1));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported rounding type");
+ }
+
+ // Make sure that border operations will start from inside the input and not the padded area
+ if(((w - 1) * stride_x) >= (width + pad_x))
+ {
+ --w;
+ }
+ if(((h - 1) * stride_y) >= (height + pad_y))
+ {
+ --h;
+ }
+ ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_x));
+ ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_y));
+
+ return std::make_pair(w, h);
+}
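+
+// Worked example (illustrative numbers): a 224x224 input with a 3x3 kernel,
+// stride 2 and padding 1 gives (224 + 2 - 3) / 2 + 1 = 112.5 per dimension,
+// i.e. 112x112 with FLOOR rounding and 113x113 with CEIL rounding.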
+
+void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
+ break;
+ case DataType::QS8:
+ case DataType::S8:
+ print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::U16:
+ print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::S16:
+ print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::U32:
+ print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::S32:
+ print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::F32:
+ print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim);
+ break;
+ case DataType::F16:
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Undefined element size for given data type");
+ }
+}
+
+int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n)
+{
+ switch(dt)
+ {
+ case DataType::U8:
+ return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
+ case DataType::QS8:
+ case DataType::S8:
+ return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
+ case DataType::U16:
+ return max_consecutive_elements_display_width_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n);
+ case DataType::S16:
+ return max_consecutive_elements_display_width_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n);
+ case DataType::U32:
+ return max_consecutive_elements_display_width_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n);
+ case DataType::S32:
+ return max_consecutive_elements_display_width_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n);
+ case DataType::F32:
+ return max_consecutive_elements_display_width_impl<float>(s, reinterpret_cast<const float *>(ptr), n);
+ case DataType::F16:
+ return 0;
+ default:
+ ARM_COMPUTE_ERROR("Undefined element size for given data type");
+ }
+}
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
new file mode 100644
index 0000000000..ae2841d7a4
--- /dev/null
+++ b/src/core/Validate.cpp
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Validate.h"
+
+void arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
+ const arm_compute::Window &full, const arm_compute::Window &win)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ full.validate();
+ win.validate();
+
+ for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].step() != win[i].step(), function, file, line);
+ }
+}
+
+void arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
+ const arm_compute::Window &full, const arm_compute::Window &sub)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ full.validate();
+ sub.validate();
+
+ for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(full[i].step() != sub[i].step(), function, file, line);
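+        // The sub-window offset must also be a multiple of the step so the
+        // sub-window's iteration points stay on the parent window's grid.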
+ ARM_COMPUTE_ERROR_ON_LOC((sub[i].start() - full[i].start()) % sub[i].step(), function, file, line);
+ }
+}
+
+void arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
+ const arm_compute::Coordinates &pos, unsigned int max_dim)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(pos);
+
+ for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(pos[i] != 0, function, file, line);
+ }
+}
+
+void arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
+ const arm_compute::Window &win, unsigned int max_dim)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(win);
+
+ for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != win[i].step(),
+ function, file, line,
+ "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
+ }
+}
+
+void arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+ const arm_compute::ITensor *tensor)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(tensor);
+
+ ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor->info()->num_dimensions() != 2,
+ function, file, line,
+ "Only 2D Tensors are supported by this kernel (%d passed)", tensor->info()->num_dimensions());
+}
+
+void arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
+ arm_compute::Format fmt, arm_compute::Channel cn)
+{
+ ARM_COMPUTE_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
+
+ switch(fmt)
+ {
+ case arm_compute::Format::RGB888:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B);
+ break;
+ case arm_compute::Format::RGBA8888:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A);
+ break;
+ case arm_compute::Format::UV88:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V);
+ break;
+ case arm_compute::Format::IYUV:
+ case arm_compute::Format::UYVY422:
+ case arm_compute::Format::YUYV422:
+ case arm_compute::Format::NV12:
+ case arm_compute::Format::NV21:
+ case arm_compute::Format::YUV444:
+ arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V);
+ break;
+ default:
+            ARM_COMPUTE_ERROR_LOC(function, file, line, "Unsupported format.");
+ }
+}
+
+void arm_compute::error_on_invalid_multi_hog(const char *function, const char *file, const int line,
+ const arm_compute::IMultiHOG *multi_hog)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+
+ ARM_COMPUTE_ERROR_ON_LOC(nullptr == multi_hog, function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC(0 == multi_hog->num_models(), function, file, line);
+
+ for(size_t i = 1; i < multi_hog->num_models(); ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->phase_type() != multi_hog->model(i)->info()->phase_type(),
+ function, file, line,
+ "All HOG parameters must have the same phase type");
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->normalization_type() != multi_hog->model(i)->info()->normalization_type(),
+ function, file, line,
+ "All HOG parameters must have the same normalization type");
+ ARM_COMPUTE_ERROR_ON_LOC_MSG((multi_hog->model(0)->info()->l2_hyst_threshold() != multi_hog->model(i)->info()->l2_hyst_threshold())
+ && (multi_hog->model(0)->info()->normalization_type() == arm_compute::HOGNormType::L2HYS_NORM),
+ function, file, line,
+ "All HOG parameters must have the same l2 hysteresis threshold if you use L2 hysteresis normalization type");
+ }
+}
+
+void arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+ const arm_compute::IKernel *kernel)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(kernel);
+
+ ARM_COMPUTE_ERROR_ON_LOC(kernel == nullptr, function, file, line);
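+    // A default-constructed Window has start == end == 0 in x, which is how
+    // a kernel whose configure() was never called is recognised here.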
+ ARM_COMPUTE_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0),
+ function, file, line,
+ "This kernel hasn't been configured.");
+}
+
+void arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
+ const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(parent_shape);
+ ARM_COMPUTE_UNUSED(coords);
+ ARM_COMPUTE_UNUSED(shape);
+
+ // Subtensor should not index in x, y dimensions.
+    ARM_COMPUTE_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
+ // Subtensor shape should match parent tensor in x, y dimensions.
+    ARM_COMPUTE_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != shape.y())), function, file, line);
+
+ // Check dimensions
+ for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC(((coords[i] >= static_cast<int>(parent_shape[i])) || (coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]))),
+ function, file, line);
+ }
+}
+
+void arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+ const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+{
+ ARM_COMPUTE_UNUSED(function);
+ ARM_COMPUTE_UNUSED(file);
+ ARM_COMPUTE_UNUSED(line);
+ ARM_COMPUTE_UNUSED(parent_valid_region);
+ ARM_COMPUTE_UNUSED(valid_region);
+
+ // Check valid regions
+ for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+ {
+ ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
+ ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+ function, file, line);
+ }
+}
diff --git a/src/runtime/CL/CLDistribution1D.cpp b/src/runtime/CL/CLDistribution1D.cpp
new file mode 100644
index 0000000000..f1dd95e77e
--- /dev/null
+++ b/src/runtime/CL/CLDistribution1D.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLDistribution1D.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDistribution1D::CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : ICLDistribution1D(num_bins, offset, range), _mem(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, num_bins * sizeof(int32_t))
+{
+}
+
+void CLDistribution1D::map(bool blocking)
+{
+ ICLDistribution1D::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLDistribution1D::unmap()
+{
+ ICLDistribution1D::unmap(CLScheduler::get().queue());
+}
+
+uint32_t *CLDistribution1D::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ return static_cast<uint32_t *>(q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
+}
+
+void CLDistribution1D::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+ q.enqueueUnmapMemObject(_mem, _mapping);
+}
+
+cl::Buffer &CLDistribution1D::cl_buffer()
+{
+ return _mem;
+}
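+
+// Usage sketch (hypothetical; assumes ICLDistribution1D exposes the mapped
+// pointer, and that a histogram kernel has already filled the buffer):
+//
+//   CLDistribution1D dist(256 /* bins */, 0 /* offset */, 256 /* range */);
+//   // ... enqueue a kernel that writes into dist.cl_buffer() ...
+//   dist.map(true); // blocking
+//   // ... read the 256 uint32_t bin counts through the mapping ...
+//   dist.unmap();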
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
new file mode 100644
index 0000000000..3f5266ce70
--- /dev/null
+++ b/src/runtime/CL/CLHOG.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/CLHOG.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOG::CLHOG()
+ : _info(), _buffer()
+{
+}
+
+void CLHOG::init(const HOGInfo &input)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+ _info = input;
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info()->descriptor_size() * sizeof(float));
+}
+
+void CLHOG::free()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+ _buffer = cl::Buffer();
+}
+
+const HOGInfo *CLHOG::info() const
+{
+ return &_info;
+}
+
+const cl::Buffer &CLHOG::cl_buffer() const
+{
+ return _buffer;
+}
+
+void CLHOG::map(bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(descriptor() != nullptr);
+ ICLHOG::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLHOG::unmap()
+{
+ ARM_COMPUTE_ERROR_ON(descriptor() == nullptr);
+ ICLHOG::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+}
+
+void CLHOG::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ q.enqueueUnmapMemObject(_buffer, descriptor());
+} \ No newline at end of file
diff --git a/src/runtime/CL/CLLut.cpp b/src/runtime/CL/CLLut.cpp
new file mode 100644
index 0000000000..a8cbf2131f
--- /dev/null
+++ b/src/runtime/CL/CLLut.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLLut.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+CLLut::CLLut()
+ : _allocator()
+{
+}
+
+CLLut::CLLut(size_t num_elements, DataType data_type)
+ : _allocator()
+{
+ _allocator.init(num_elements, data_type);
+}
+
+size_t CLLut::num_elements() const
+{
+ return _allocator.num_elements();
+}
+
+uint32_t CLLut::index_offset() const
+{
+ return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0;
+}
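+
+// Example (behavioural assumption drawn from the expression above): a
+// 512-element S16 LUT reports index_offset() == 256, presumably so that a
+// signed input of -256 can address the first element of the table.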
+
+size_t CLLut::size_in_bytes() const
+{
+ return _allocator.size();
+}
+
+DataType CLLut::type() const
+{
+ return _allocator.type();
+}
+
+const cl::Buffer &CLLut::cl_buffer() const
+{
+ return _allocator.cl_data();
+}
+
+void CLLut::clear()
+{
+ cl::CommandQueue &q = CLScheduler::get().queue();
+ uint8_t *data = _allocator.map(q, true /* blocking */);
+ std::memset(data, 0, size_in_bytes());
+ _allocator.unmap(q, data);
+}
+
+ILutAllocator *CLLut::allocator()
+{
+ return &_allocator;
+}
+
+void CLLut::map(bool blocking)
+{
+ ICLLut::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLLut::unmap()
+{
+ ICLLut::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLLut::do_map(cl::CommandQueue &q, bool blocking)
+{
+ return _allocator.map(q, blocking);
+}
+
+void CLLut::do_unmap(cl::CommandQueue &q)
+{
+ _allocator.unmap(q, buffer());
+}
diff --git a/src/runtime/CL/CLLutAllocator.cpp b/src/runtime/CL/CLLutAllocator.cpp
new file mode 100644
index 0000000000..311de4bb8d
--- /dev/null
+++ b/src/runtime/CL/CLLutAllocator.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLLutAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLLutAllocator::CLLutAllocator()
+ : _buffer(), _mapping(nullptr)
+{
+}
+
+uint8_t *CLLutAllocator::data()
+{
+ return _mapping;
+}
+
+const cl::Buffer &CLLutAllocator::cl_data() const
+{
+ return _buffer;
+}
+
+uint8_t *CLLutAllocator::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, size()));
+}
+
+void CLLutAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ q.enqueueUnmapMemObject(_buffer, mapping);
+}
+
+void CLLutAllocator::allocate()
+{
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size());
+}
+
+uint8_t *CLLutAllocator::lock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+ cl::CommandQueue q = CLScheduler::get().queue();
+ _mapping = map(q, true);
+ return _mapping;
+}
+
+void CLLutAllocator::unlock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ cl::CommandQueue q = CLScheduler::get().queue();
+ unmap(q, _mapping);
+ _mapping = nullptr;
+}
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
new file mode 100644
index 0000000000..b9e8739454
--- /dev/null
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiHOG.h"
+
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+CLMultiHOG::CLMultiHOG(size_t num_models)
+ : _num_models(num_models), _model(arm_compute::cpp14::make_unique<CLHOG[]>(_num_models))
+{
+}
+
+size_t CLMultiHOG::num_models() const
+{
+ return _num_models;
+}
+
+ICLHOG *CLMultiHOG::cl_model(size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
+
+const ICLHOG *CLMultiHOG::cl_model(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+} \ No newline at end of file
diff --git a/src/runtime/CL/CLMultiImage.cpp b/src/runtime/CL/CLMultiImage.cpp
new file mode 100644
index 0000000000..63059cb5f4
--- /dev/null
+++ b/src/runtime/CL/CLMultiImage.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMultiImage.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+CLMultiImage::CLMultiImage()
+ : _info(), _plane()
+{
+}
+
+const MultiImageInfo *CLMultiImage::info() const
+{
+ return &_info;
+}
+
+void CLMultiImage::init(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, false);
+}
+
+void CLMultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, true);
+}
+
+void CLMultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
+{
+ TensorInfo info(width, height, Format::U8);
+
+ if(auto_padding)
+ {
+ info.auto_padding();
+ }
+
+ switch(format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ {
+ TensorInfo info_full(width, height, format);
+
+ if(auto_padding)
+ {
+ info_full.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info_full);
+ break;
+ }
+ case Format::NV12:
+ case Format::NV21:
+ {
+ TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+
+ if(auto_padding)
+ {
+ info_uv88.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_uv88);
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+
+ if(auto_padding)
+ {
+ info_sub2.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_sub2);
+ std::get<2>(_plane).allocator()->init(info_sub2);
+ break;
+ }
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info);
+ std::get<2>(_plane).allocator()->init(info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _info.init(width, height, format);
+}
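+
+// Plane layout example (sizes are illustrative): a 640x480 NV12 image is
+// backed by a 640x480 U8 plane for Y plus a 320x240 UV88 plane of
+// interleaved chroma; IYUV instead uses one full-size and two
+// half-resolution U8 planes.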
+
+void CLMultiImage::allocate()
+{
+ switch(_info.format())
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ std::get<0>(_plane).allocator()->allocate();
+ break;
+ case Format::NV12:
+ case Format::NV21:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ break;
+ case Format::IYUV:
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ std::get<2>(_plane).allocator()->allocate();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+}
+
+CLImage *CLMultiImage::cl_plane(unsigned int index)
+{
+ return &_plane[index];
+}
+
+const CLImage *CLMultiImage::cl_plane(unsigned int index) const
+{
+ return &_plane[index];
+}
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
new file mode 100644
index 0000000000..41d81ea0f8
--- /dev/null
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <array>
+#include <cmath>
+
+using namespace arm_compute;
+
+CLPyramid::CLPyramid()
+ : _info(), _pyramid(nullptr)
+{
+}
+
+void CLPyramid::init(const PyramidInfo &info)
+{
+ internal_init(info, false);
+}
+
+void CLPyramid::init_auto_padding(const PyramidInfo &info)
+{
+ internal_init(info, true);
+}
+
+void CLPyramid::internal_init(const PyramidInfo &info, bool auto_padding)
+{
+ _info = info;
+ _pyramid = arm_compute::cpp14::make_unique<CLTensor[]>(_info.num_levels());
+
+ size_t w = _info.width();
+ size_t h = _info.height();
+ size_t ref_w = w;
+ size_t ref_h = h;
+ const bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
+ TensorShape tensor_shape = _info.tensor_shape();
+
+ // Note: Look-up table used by the OpenVX sample implementation
+ const std::array<float, 4> c_orbscale =
+ {
+ {
+ 0.5f,
+ SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+            SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+ }
+ };
+
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ TensorInfo tensor_info(tensor_shape, _info.format());
+
+ if(auto_padding)
+ {
+ tensor_info.auto_padding();
+ }
+
+ _pyramid[i].allocator()->init(tensor_info);
+
+ if(is_orb_scale)
+ {
+ const float orb_scale = c_orbscale[(i + 1) % 4];
+ w = std::ceil(ref_w * orb_scale);
+ h = std::ceil(ref_h * orb_scale);
+
+ if(0 == ((i + 1) % 4))
+ {
+ ref_w = w;
+ ref_h = h;
+ }
+ }
+ else
+ {
+ w = (w + 1) * _info.scale();
+ h = (h + 1) * _info.scale();
+ }
+
+ // Update tensor_shape
+ tensor_shape.set(0, w);
+ tensor_shape.set(1, h);
+ }
+}
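+
+// Sizing sketch: with SCALE_PYRAMID_ORB defined by the library as 2^(-1/4)
+// (approx. 0.8409), the look-up table {0.5, s, s^2, s^3} combined with the
+// reference size halving every fourth level makes level i measure roughly
+// ceil(w * s^i) x ceil(h * s^i), with an exact halving at every fourth
+// level since s^4 = 0.5.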
+
+void CLPyramid::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
+
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ (_pyramid.get() + i)->allocator()->allocate();
+ }
+}
+
+const PyramidInfo *CLPyramid::info() const
+{
+ return &_info;
+}
+
+CLTensor *CLPyramid::get_pyramid_level(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
+
+ return (_pyramid.get() + index);
+}
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
new file mode 100644
index 0000000000..fe25ce534c
--- /dev/null
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+using namespace arm_compute;
+
+CLScheduler::CLScheduler()
+ : _context(), _queue(), _target(GPUTarget::MIDGARD)
+{
+}
+
+CLScheduler &CLScheduler::get()
+{
+ static CLScheduler scheduler;
+ return scheduler;
+}
+
+void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+{
+ kernel.run(kernel.window(), _queue);
+
+ if(flush)
+ {
+ _queue.flush();
+ }
+}
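+
+// Usage sketch (some_kernel is a placeholder for any configured ICLKernel):
+//
+//   CLScheduler::get().enqueue(some_kernel, true /* flush */);
+//
+// Each kernel runs over its own maximal window on the singleton's queue.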
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
new file mode 100644
index 0000000000..b228c0abda
--- /dev/null
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(nullptr), _info()
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _parent = parent;
+}
+
+ITensorInfo *CLSubTensor::info() const
+{
+ return &_info;
+}
+
+ITensorInfo *CLSubTensor::info()
+{
+ return &_info;
+}
+
+const cl::Buffer &CLSubTensor::cl_buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->cl_buffer();
+}
+
+ICLTensor *CLSubTensor::parent()
+{
+ return _parent;
+}
+
+void CLSubTensor::map(bool blocking)
+{
+ ICLTensor::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLSubTensor::unmap()
+{
+ ICLTensor::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->total_size()));
+}
+
+void CLSubTensor::do_unmap(cl::CommandQueue &q)
+{
+ ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
+ q.enqueueUnmapMemObject(cl_buffer(), buffer());
+}
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
new file mode 100644
index 0000000000..eefa0331d5
--- /dev/null
+++ b/src/runtime/CL/CLTensor.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLTensor::CLTensor()
+ : _allocator()
+{
+}
+
+TensorInfo *CLTensor::info() const
+{
+ return &_allocator.info();
+}
+
+TensorInfo *CLTensor::info()
+{
+ return &_allocator.info();
+}
+
+const cl::Buffer &CLTensor::cl_buffer() const
+{
+ return _allocator.cl_data();
+}
+
+ITensorAllocator *CLTensor::allocator()
+{
+ return &_allocator;
+}
+
+void CLTensor::map(bool blocking)
+{
+ ICLTensor::map(CLScheduler::get().queue(), blocking);
+}
+
+void CLTensor::unmap()
+{
+ ICLTensor::unmap(CLScheduler::get().queue());
+}
+
+uint8_t *CLTensor::do_map(cl::CommandQueue &q, bool blocking)
+{
+ return _allocator.map(q, blocking);
+}
+
+void CLTensor::do_unmap(cl::CommandQueue &q)
+{
+ _allocator.unmap(q, buffer());
+}
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
new file mode 100644
index 0000000000..8112a7148f
--- /dev/null
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLTensorAllocator::CLTensorAllocator()
+ : _buffer(), _mapping(nullptr)
+{
+}
+
+uint8_t *CLTensorAllocator::data()
+{
+ return _mapping;
+}
+
+const cl::Buffer &CLTensorAllocator::cl_data() const
+{
+ return _buffer;
+}
+
+void CLTensorAllocator::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+
+ _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+ info().set_is_resizable(false);
+}
+
+void CLTensorAllocator::free()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+
+ _buffer = cl::Buffer();
+ info().set_is_resizable(true);
+}
+
+uint8_t *CLTensorAllocator::lock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+ _mapping = map(CLScheduler::get().queue(), true);
+ return _mapping;
+}
+
+void CLTensorAllocator::unlock()
+{
+ ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+ unmap(CLScheduler::get().queue(), _mapping);
+ _mapping = nullptr;
+}
+
+uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size()));
+}
+
+void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
+{
+ ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
+ q.enqueueUnmapMemObject(_buffer, mapping);
+}
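In raw OpenCL terms, the allocate()/map()/unmap() trio above reduces to the following (a sketch; ctx, queue and size stand in for the scheduler state and tensor size used in this file):

    // CL_MEM_ALLOC_HOST_PTR asks the driver for host-accessible memory, so the
    // subsequent map can be zero-copy on shared-memory devices such as Mali.
    uint8_t *allocate_and_map(cl::Context &ctx, cl::CommandQueue &queue, size_t size, cl::Buffer &buf)
    {
        buf = cl::Buffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
        return static_cast<uint8_t *>(queue.enqueueMapBuffer(buf, CL_TRUE /* blocking */, CL_MAP_READ | CL_MAP_WRITE, 0, size));
    }
    // ... the host reads/writes through the returned pointer, then
    // queue.enqueueUnmapMemObject(buf, ptr); releases the mapping.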
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
new file mode 100644
index 0000000000..aa45743d37
--- /dev/null
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+ICLSimpleFunction::ICLSimpleFunction()
+ : _kernel(), _border_handler()
+{
+}
+
+void ICLSimpleFunction::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the CL kernel or the function isn't configured");
+
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(*_kernel);
+}
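Every function in the files that follow specializes this class the same way: configure() builds a kernel and stores it in _kernel, and the inherited run() enqueues the border handler without flushing, then the kernel with the default flush. A hypothetical subclass (CLMyFilter and CLMyFilterKernel are illustrative names, not library classes) would look like:

    void CLMyFilter::configure(const ICLTensor *input, ICLTensor *output)
    {
        auto k = arm_compute::cpp14::make_unique<CLMyFilterKernel>();
        k->configure(input, output);
        _kernel = std::move(k);
        // Kernels that read a neighbourhood would also configure _border_handler here.
    }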
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
new file mode 100644
index 0000000000..5097dd4710
--- /dev/null
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
+
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLAbsoluteDifference::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLAbsoluteDifferenceKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
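From the caller's side these thin wrappers are all driven the same way (a sketch; src1, src2 and dst are assumed to be CLTensors initialised and allocated as in the CLTensor example above):

    CLAbsoluteDifference abs_diff;
    abs_diff.configure(&src1, &src2, &dst); // builds and validates the kernel once
    abs_diff.run();                         // enqueues it on the scheduler's queue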
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
new file mode 100644
index 0000000000..56c519984c
--- /dev/null
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
+
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLAccumulate::configure(const ICLTensor *input, ICLTensor *accum)
+{
+ auto k = arm_compute::cpp14::make_unique<CLAccumulateKernel>();
+ k->configure(input, accum);
+ _kernel = std::move(k);
+}
+
+void CLAccumulateWeighted::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
+{
+ auto k = arm_compute::cpp14::make_unique<CLAccumulateWeightedKernel>();
+ k->configure(input, alpha, accum);
+ _kernel = std::move(k);
+}
+
+void CLAccumulateSquared::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
+ auto k = arm_compute::cpp14::make_unique<CLAccumulateSquaredKernel>();
+ k->configure(input, shift, accum);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
new file mode 100644
index 0000000000..9b5bd8b663
--- /dev/null
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+ auto k = arm_compute::cpp14::make_unique<CLActivationLayerKernel>();
+ k->configure(input, output, act_info);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
new file mode 100644
index 0000000000..36bff4285c
--- /dev/null
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
+
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<CLArithmeticAdditionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
new file mode 100644
index 0000000000..97f0a1caf4
--- /dev/null
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<CLArithmeticSubtractionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
new file mode 100644
index 0000000000..3df673c6a6
--- /dev/null
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLBatchNormalizationLayer::CLBatchNormalizationLayer()
+ : _norm_kernel()
+{
+}
+
+void CLBatchNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+{
+ _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void CLBatchNormalizationLayer::run()
+{
+ CLScheduler::get().enqueue(_norm_kernel, true);
+}
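For reference, the configured kernel applies the usual batch-normalization transform per channel, out = gamma * (in - mean) / sqrt(var + epsilon) + beta. A configuration sketch (tensor setup elided; names and epsilon are illustrative):

    CLBatchNormalizationLayer bn;
    // mean, var, beta and gamma each hold one value per input feature map.
    bn.configure(&input, &output, &mean, &var, &beta, &gamma, 0.001f);
    bn.run(); // enqueues the kernel with a blocking flush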
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
new file mode 100644
index 0000000000..7c85043206
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseAnd::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseAndKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
new file mode 100644
index 0000000000..17ae5dea3c
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseNot::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseNotKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
new file mode 100644
index 0000000000..c84a279bae
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseOr::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseOrKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
new file mode 100644
index 0000000000..fd49c7d818
--- /dev/null
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
+
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBitwiseXor::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBitwiseXorKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
new file mode 100644
index 0000000000..8de6807c73
--- /dev/null
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLBox3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLBox3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
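A 3x3 box filter reads one pixel beyond the image on each side, hence the 1-pixel border configured above; with BorderMode::UNDEFINED the frame is not filled and the valid output region shrinks instead. Usage sketch (tensor names illustrative):

    CLBox3x3 box;
    box.configure(&src, &dst, BorderMode::CONSTANT, 0); // frame filled with zeros
    box.run();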
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
new file mode 100644
index 0000000000..1d018b8347
--- /dev/null
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
+#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+
+using namespace arm_compute;
+
+CLCannyEdge::CLCannyEdge()
+ : _sobel(nullptr), _gradient(), _border_mag_gradient(), _non_max_suppr(), _edge_trace(), _gx(), _gy(), _mag(), _phase(), _nonmax(), _visited(), _recorded(), _l1_list_counter(), _l1_stack()
+{
+}
+
+void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+ ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
+
+ const unsigned int L1_hysteresis_stack_size = 8;
+ const TensorShape shape = input->info()->tensor_shape();
+
+ TensorInfo gradient_info;
+ TensorInfo info;
+
+ // Initialize images
+ if(gradient_size < 7)
+ {
+ gradient_info.init(shape, 1, arm_compute::DataType::S16);
+ info.init(shape, 1, arm_compute::DataType::U16);
+ }
+ else
+ {
+ gradient_info.init(shape, 1, arm_compute::DataType::S32);
+ info.init(shape, 1, arm_compute::DataType::U32);
+ }
+
+ _gx.allocator()->init(gradient_info);
+ _gy.allocator()->init(gradient_info);
+ _mag.allocator()->init(info);
+ _nonmax.allocator()->init(info);
+
+ TensorInfo info_u8(shape, 1, arm_compute::DataType::U8);
+ _phase.allocator()->init(info_u8);
+ _l1_list_counter.allocator()->init(info_u8);
+
+ TensorInfo info_u32(shape, 1, arm_compute::DataType::U32);
+ _visited.allocator()->init(info_u32);
+ _recorded.allocator()->init(info_u32);
+
+ TensorShape shape_l1_stack = input->info()->tensor_shape();
+ shape_l1_stack.set(0, input->info()->dimension(0) * L1_hysteresis_stack_size);
+ TensorInfo info_s32(shape_l1_stack, 1, arm_compute::DataType::S32);
+ _l1_stack.allocator()->init(info_s32);
+
+ // Configure/init the Sobel NxN filter
+ if(gradient_size == 3)
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 5)
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 7)
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size);
+ }
+
+ // Configure gradient
+ _gradient.configure(&_gx, &_gy, &_mag, &_phase, norm_type);
+
+ // Configure non-maxima suppression
+ _non_max_suppr.configure(&_mag, &_phase, &_nonmax, lower_thr, border_mode == BorderMode::UNDEFINED);
+
+ // Fill border around magnitude image as non-maxima suppression will access
+ // it. If the border mode is undefined, filling the border is a no-op.
+ _border_mag_gradient.configure(&_mag, _non_max_suppr.border_size(), border_mode, constant_border_value);
+
+ // Configure edge tracing
+ _edge_trace.configure(&_nonmax, output, upper_thr, lower_thr, &_visited, &_recorded, &_l1_stack, &_l1_list_counter);
+
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _mag.allocator()->allocate();
+ _visited.allocator()->allocate();
+ _recorded.allocator()->allocate();
+ _l1_stack.allocator()->allocate();
+ _l1_list_counter.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void CLCannyEdge::run()
+{
+ // Run Sobel
+ _sobel->run();
+
+ // Run phase and magnitude calculation
+ CLScheduler::get().enqueue(_gradient, false);
+
+ // Fill border before non-maxima suppression. Nop for border mode undefined.
+ CLScheduler::get().enqueue(_border_mag_gradient, false);
+
+ // Run non-maxima suppression
+ _nonmax.clear(CLScheduler::get().queue());
+ CLScheduler::get().enqueue(_non_max_suppr, false);
+
+ // Clear temporary structures and run edge trace
+ _visited.clear(CLScheduler::get().queue());
+ _recorded.clear(CLScheduler::get().queue());
+ _l1_list_counter.clear(CLScheduler::get().queue());
+ _l1_stack.clear(CLScheduler::get().queue());
+ CLScheduler::get().enqueue(_edge_trace, true);
+}
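End-to-end usage sketch of the pipeline configured above (Sobel, gradient magnitude/phase, non-maxima suppression, edge tracing); the thresholds, gradient size and norm type are illustrative choices:

    CLCannyEdge canny;
    // Hysteresis thresholds 100/50, 3x3 Sobel, norm_type 2 (L2 norm).
    canny.configure(&src, &dst, 100, 50, 3, 2, BorderMode::REPLICATE, 0);
    canny.run();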
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
new file mode 100644
index 0000000000..79a3676bd7
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLChannelCombine::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, plane3, output);
+ _kernel = std::move(k);
+}
+
+void CLChannelCombine::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
new file mode 100644
index 0000000000..2c6174b9ee
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLChannelExtract::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
+
+void CLChannelExtract::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
new file mode 100644
index 0000000000..2fe465aeb8
--- /dev/null
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
+
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLColorConvert::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void CLColorConvert::configure(const ICLImage *input, ICLMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void CLColorConvert::configure(const ICLMultiImage *input, ICLImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void CLColorConvert::configure(const ICLMultiImage *input, ICLMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
new file mode 100644
index 0000000000..21b5d47679
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvolution.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLConvolution3x3Kernel>();
+ k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+template <unsigned int matrix_size>
+CLConvolutionSquare<matrix_size>::CLConvolutionSquare()
+ : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+{
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(conv == nullptr);
+ int16_t conv_col[matrix_size];
+ int16_t conv_row[matrix_size];
+ _is_separable = separate_matrix(conv, conv_col, conv_row, matrix_size);
+
+ if(_is_separable)
+ {
+ std::pair<DataType, DataType> type_pair = data_type_for_convolution(conv_col, conv_row, matrix_size);
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, type_pair.first));
+
+ if(scale == 0)
+ {
+ scale = calculate_matrix_scale(conv, matrix_size);
+ }
+
+ _kernel_hor.configure(input, &_tmp, conv_row, border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, conv_col, scale, border_mode == BorderMode::UNDEFINED, type_pair.second);
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+
+ // Allocate intermediate buffer
+ _tmp.allocator()->allocate();
+ }
+ else
+ {
+ _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ }
+}
+
+template <unsigned int matrix_size>
+void CLConvolutionSquare<matrix_size>::run()
+{
+ CLScheduler::get().enqueue(_border_handler);
+
+ if(_is_separable)
+ {
+ CLScheduler::get().enqueue(_kernel_hor, false);
+ CLScheduler::get().enqueue(_kernel_vert);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_kernel);
+ }
+}
+
+template class arm_compute::CLConvolutionSquare<5>;
+template class arm_compute::CLConvolutionSquare<7>;
+template class arm_compute::CLConvolutionSquare<9>;
+
+void CLConvolutionRectangle::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLConvolutionRectangleKernel>();
+ k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
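The separable path above relies on separate_matrix() spotting when the NxN matrix is the outer product of a column and a row vector, so two 1D passes through _tmp replace one 2D pass. A 5x5 Gaussian is the classic separable case (sketch; passing scale 0 requests the automatic sum-of-coefficients scale computed above):

    // 5x5 Gaussian = [1 4 6 4 1]^T x [1 4 6 4 1], coefficient sum 256.
    const int16_t gauss5x5[] =
    {
        1,  4,  6,  4, 1,
        4, 16, 24, 16, 4,
        6, 24, 36, 24, 6,
        4, 16, 24, 16, 4,
        1,  4,  6,  4, 1
    };

    CLConvolutionSquare<5> conv;
    conv.configure(&src, &dst, gauss5x5, 0 /* auto scale */, BorderMode::REPLICATE, 0);
    conv.run();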
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
new file mode 100644
index 0000000000..f0bbc3514f
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+{
+}
+
+void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const bool has_bias = (biases != nullptr);
+
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Create tensor to store the reshaped weights
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (has_bias ? 1 : 0);
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ const DataType dt = weights->info()->data_type();
+ TensorInfo info_wr(shape_wr, 1, dt);
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases, output);
+ }
+}
+
+void CLConvolutionLayerReshapeWeights::run()
+{
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ if(_transpose1xW)
+ {
+ CLScheduler::get().enqueue(_weights_transposed_kernel);
+ }
+}
+
+CLConvolutionLayer::CLConvolutionLayer()
+ : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+ _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ const unsigned int kernel_width = _are_weights_reshaped ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ // Check if it's a "fully connected" convolution
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
+ // Create tensor to store the reshaped weights
+ size_t mat_weights_cols = weights->info()->dimension(3);
+ size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type());
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false);
+ weights = &_weights_reshaped;
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ TensorShape shape_wt(mat_weights_rows * 4, static_cast<size_t>(std::ceil(mat_weights_cols / 4.f)));
+ TensorInfo info_wt(shape_wt, 1, weights->info()->data_type());
+ _weights_transposed.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_transposed, true);
+ weights = &_weights_transposed;
+ }
+ }
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(!_is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, input->info()->data_type()));
+ }
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ }
+
+ if(!_are_weights_reshaped)
+ {
+ if(!_is_fully_connected_convolution)
+ {
+ _weights_transposed.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
+ }
+
+ _input_im2col_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
+ _gemm_output.allocator()->allocate();
+}
+
+void CLConvolutionLayer::run()
+{
+ // Run weights reshaping (runs once per configure)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights.run();
+ }
+
+ // Run input reshaping
+ CLScheduler::get().enqueue(_input_im2col_kernel);
+ if(!_is_fully_connected_convolution)
+ {
+ CLScheduler::get().enqueue(_input_interleave_kernel);
+ }
+
+ // Run matrix multiply on reshaped matrices
+ CLScheduler::get().enqueue(_mm_kernel);
+
+ // Reshape output matrix
+ CLScheduler::get().enqueue(_output_col2im_kernel, false);
+}
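The layer above implements convolution as im2col + GEMM: each input patch is unrolled into a row (mat_input_rows = conv_w * conv_h rows in total), each filter into a column (mat_weights_cols columns, one per output feature map), and col2im folds the GEMM result back into feature maps. A configuration sketch (shapes are illustrative):

    // 3x3 convolution, stride 1, pad 1: a 32x32x16 input yields 32 maps of 32x32.
    CLConvolutionLayer conv;
    conv.configure(&input,   // 32x32x16
                   &weights, // 3x3x16x32
                   &biases,  // 32
                   &output,  // 32x32x32
                   PadStrideInfo(1, 1, 1, 1));
    conv.run();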
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp
new file mode 100644
index 0000000000..d967d9865f
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenate::CLDepthConcatenate()
+ : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+ _num_inputs = inputs_vector.size();
+
+ unsigned int depth_offset = 0;
+
+ _concat_kernels_vector = arm_compute::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
+
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ depth_offset += inputs_vector.at(i)->info()->dimension(2);
+ }
+}
+
+void CLDepthConcatenate::run()
+{
+ for(unsigned int i = 0; i < _num_inputs; i++)
+ {
+ CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+ CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ }
+}
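Usage sketch for concatenation along the depth axis; each input is framed with a constant zero border as configured above, and the output depth is the sum of the input depths (shapes illustrative):

    std::vector<ICLTensor *> inputs = { &feat_a /* WxHx16 */, &feat_b /* WxHx32 */ };
    CLDepthConcatenate concat;
    concat.configure(inputs, &out /* WxHx48 */);
    concat.run();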
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvert.cpp
new file mode 100644
index 0000000000..edcd4928ab
--- /dev/null
+++ b/src/runtime/CL/functions/CLDepthConvert.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ auto k = arm_compute::cpp14::make_unique<CLDepthConvertKernel>();
+ k->configure(input, output, policy, shift);
+ _kernel = std::move(k);
+}
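+
+// Usage sketch (illustrative comment only), assuming src is U8 and dst is U16,
+// both already allocated:
+//
+//   CLDepthConvert convert;
+//   convert.configure(&src, &dst, ConvertPolicy::SATURATE, 1 /* shift */);
+//   convert.run(); // run() is inherited from the simple-function base class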
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
new file mode 100644
index 0000000000..c51cb4c333
--- /dev/null
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDerivative.h"
+
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLDerivative::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLDerivativeKernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
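+
+// Usage sketch (illustrative comment only). CLDilate, CLErode and CLGaussian3x3
+// below follow the same pattern: one kernel plus a border handler configured from
+// the requested border mode:
+//
+//   CLDerivative deriv;
+//   deriv.configure(&src, &dx, &dy, BorderMode::REPLICATE, 0);
+//   deriv.run(); // the border handler runs first, then the derivative kernel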
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
new file mode 100644
index 0000000000..345f47763c
--- /dev/null
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDilate.h"
+
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLDilate::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLDilateKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
new file mode 100644
index 0000000000..3b182d31b6
--- /dev/null
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
+
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <numeric>
+
+using namespace arm_compute;
+
+namespace
+{
+void calculate_cum_dist_and_lut(CLDistribution1D &dist, CLDistribution1D &cum_dist, CLLut &lut)
+{
+ dist.map(true);
+ cum_dist.map(true);
+ lut.map(true);
+
+ const uint32_t *dist_ptr = dist.buffer();
+ uint32_t *cum_dist_ptr = cum_dist.buffer();
+ uint8_t *lut_ptr = lut.buffer();
+
+ ARM_COMPUTE_ERROR_ON(dist_ptr == nullptr);
+ ARM_COMPUTE_ERROR_ON(cum_dist_ptr == nullptr);
+ ARM_COMPUTE_ERROR_ON(lut_ptr == nullptr);
+
+ // Calculate cumulative distribution
+ std::partial_sum(dist_ptr, dist_ptr + 256, cum_dist_ptr);
+
+ // Get the number of pixels that have the lowest value in the input image
+ const uint32_t num_lowest_pixels = *std::find_if(dist_ptr, dist_ptr + 256, [](const uint32_t &v)
+ {
+ return v > 0;
+ });
+ const size_t image_size = cum_dist_ptr[255];
+
+ if(image_size == num_lowest_pixels)
+ {
+ std::iota(lut_ptr, lut_ptr + 256, 0);
+ }
+ else
+ {
+ const float diff = image_size - num_lowest_pixels;
+
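+ // Classic equalization mapping, skipping the lowest occupied bin:
+ // lut[i] = round((cdf[i] - cdf_min) / (N - cdf_min) * 255),
+ // where cdf_min == num_lowest_pixels and N == image_size.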
+ for(size_t i = 0; i < 256; ++i)
+ {
+ lut_ptr[i] = lround((cum_dist_ptr[i] - num_lowest_pixels) / diff * 255.f);
+ }
+ }
+
+ dist.unmap();
+ cum_dist.unmap();
+ lut.unmap();
+}
+} // namespace
+
+CLEqualizeHistogram::CLEqualizeHistogram()
+ : _histogram_kernel(), _border_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
+{
+}
+
+void CLEqualizeHistogram::configure(const ICLImage *input, ICLImage *output)
+{
+ _histogram_kernel.configure(input, &_hist);
+ _border_histogram_kernel.configure(input, &_hist);
+ _map_histogram_kernel.configure(input, &_cd_lut, output);
+}
+
+void CLEqualizeHistogram::run()
+{
+ // Calculate histogram of input.
+ CLScheduler::get().enqueue(_histogram_kernel, false);
+
+ // Process the remaining pixels when the image size is not a multiple of the histogram kernel's processing elements
+ CLScheduler::get().enqueue(_border_histogram_kernel, false);
+
+ // Calculate cumulative distribution of histogram and create LUT.
+ calculate_cum_dist_and_lut(_hist, _cum_dist, _cd_lut);
+
+ // Map input to output using created LUT.
+ CLScheduler::get().enqueue(_map_histogram_kernel);
+}
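+
+// Usage sketch (illustrative comment only), for U8 grayscale images:
+//
+//   CLEqualizeHistogram eq;
+//   eq.configure(&src, &dst);
+//   eq.run(); // two histogram passes, host-side LUT build, then the LUT mapping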
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
new file mode 100644
index 0000000000..b4c50e465a
--- /dev/null
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLErode.h"
+
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLErode::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLErodeKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
new file mode 100644
index 0000000000..d2903fb849
--- /dev/null
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <algorithm>
+#include <cstring>
+
+using namespace arm_compute;
+
+CLFastCorners::CLFastCorners()
+ : _fast_corners_kernel(),
+ _suppr_func(),
+ _copy_array_kernel(),
+ _output(),
+ _suppr(),
+ _win(),
+ _non_max(false),
+ _num_corners(nullptr),
+ _num_buffer(),
+ _corners(nullptr),
+ _constant_border_value(0)
+{
+}
+
+void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, CLKeyPointArray *const corners,
+ unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
+ ARM_COMPUTE_ERROR_ON(nullptr == corners);
+ ARM_COMPUTE_ERROR_ON(threshold < 1 || threshold > 255);
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::U8);
+ _output.allocator()->init(tensor_info);
+
+ _non_max = nonmax_suppression;
+ _num_corners = num_corners;
+ _corners = corners;
+ _num_buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
+ _constant_border_value = constant_border_value;
+
+ const bool update_number = (nullptr != _num_corners);
+
+ _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, border_mode);
+
+ if(!_non_max)
+ {
+ _copy_array_kernel.configure(&_output, update_number, corners, &_num_buffer);
+ }
+ else
+ {
+ _suppr.allocator()->init(tensor_info);
+
+ _suppr_func.configure(&_output, &_suppr, border_mode);
+ _copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer);
+
+ _suppr.allocator()->allocate();
+ }
+
+ // Allocate intermediate tensors
+ _output.allocator()->allocate();
+}
+
+void CLFastCorners::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ if(_non_max)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(_output.cl_buffer().get() == nullptr, "Unconfigured function");
+ const auto out_buffer = static_cast<unsigned char *>(q.enqueueMapBuffer(_output.cl_buffer(), CL_TRUE, CL_MAP_WRITE, 0, _output.info()->total_size()));
+ memset(out_buffer, 0, _output.info()->total_size());
+ q.enqueueUnmapMemObject(_output.cl_buffer(), out_buffer);
+ }
+
+ CLScheduler::get().enqueue(_fast_corners_kernel, false);
+
+ if(_non_max)
+ {
+ _suppr_func.run();
+ }
+
+ CLScheduler::get().enqueue(_copy_array_kernel, false);
+
+ unsigned int get_num_corners = 0;
+ q.enqueueReadBuffer(_num_buffer, CL_TRUE, 0, sizeof(unsigned int), &get_num_corners);
+
+ size_t corner_size = std::min(static_cast<size_t>(get_num_corners), _corners->max_num_values());
+
+ _corners->resize(corner_size);
+
+ if(_num_corners != nullptr)
+ {
+ *_num_corners = get_num_corners;
+ }
+
+ q.flush();
+}
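+
+// Usage sketch (illustrative comment only). Note that configure() only accepts
+// BorderMode::UNDEFINED, and the corners array capacity caps the output size:
+//
+//   CLKeyPointArray corners(10000);
+//   unsigned int num_corners = 0;
+//   CLFastCorners fast;
+//   fast.configure(&src, 20.f /* threshold */, true /* non-max suppression */,
+//                  &corners, &num_corners, BorderMode::UNDEFINED);
+//   fast.run(); // corners is resized to min(detected, capacity)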
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
new file mode 100644
index 0000000000..9e59b771d8
--- /dev/null
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLFillBorderKernel>();
+ k->configure(tensor, border_width, border_mode, constant_border_value);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
new file mode 100644
index 0000000000..57d57d517f
--- /dev/null
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+
+CLFullyConnectedLayerReshapeWeights::CLFullyConnectedLayerReshapeWeights()
+ : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _transpose_weights = transpose_weights;
+ _is_batched_fc_layer = is_batched_fc_layer;
+
+ // Check if we need to transpose the weights
+ if(_transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Initialize the output tensor for transpose
+ TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_kernel.configure(input, &_transpose_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(&_transpose_output, output);
+
+ // Allocate temporary tensor used for transposing the weights
+ _transpose_output.allocator()->allocate();
+ }
+ else
+ {
+ _transpose_kernel.configure(input, output);
+ }
+ }
+ else
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+ }
+ }
+}
+
+void CLFullyConnectedLayerReshapeWeights::run()
+{
+ if(_transpose_weights)
+ {
+ CLScheduler::get().enqueue(_transpose_kernel, _is_batched_fc_layer);
+ }
+ if(_is_batched_fc_layer)
+ {
+ CLScheduler::get().enqueue(_transpose1xW_kernel);
+ }
+}
+
+CLFullyConnectedLayer::CLFullyConnectedLayer()
+ : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+ _are_weights_reshaped(true), _is_fc_after_conv(true), _is_batched_fc_layer(false), _accumulate_biases(false)
+{
+}
+
+void CLFullyConnectedLayer::configure_conv_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, input->info()->dimension(3));
+ shape_im2col.set(2, input->info()->dimension(4));
+ shape_im2col.set(3, input->info()->dimension(5));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void CLFullyConnectedLayer::configure_fc_fc_wb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = input->info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(input, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void CLFullyConnectedLayer::configure_conv_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, 1);
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+
+ // Allocate the output tensor for im2col once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+}
+
+void CLFullyConnectedLayer::configure_fc_fc_nb(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(input, weights, output, 1.0f);
+}
+
+void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ // Check if we have a fully connected layer with batches
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
+
+ const ICLTensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
+ {
+ if((transpose_weights || _is_batched_fc_layer))
+ {
+ weights_to_use = &_reshape_weights_output;
+
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer with batches
+ configure_conv_fc_wb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer with batches
+ configure_fc_fc_wb(input, weights_to_use, output);
+ }
+ }
+ else
+ {
+ // If the fully connected layer is not batched, the weights are not reshaped using transpose1xW
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc_nb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc_nb(input, weights_to_use, output);
+ }
+ }
+
+ // If the weights have not been reshaped yet, allocate the reshaped-weights tensor now that all the configure methods have been called
+ if(!are_weights_reshaped)
+ {
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
+ }
+}
+
+void CLFullyConnectedLayer::run()
+{
+ // Reshape of the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
+ }
+
+ // Linearize input if it comes from a convolutional layer
+ if(_is_fc_after_conv)
+ {
+ CLScheduler::get().enqueue(_im2col_kernel, false);
+ }
+
+ // Interleave input
+ if(_is_batched_fc_layer)
+ {
+ CLScheduler::get().enqueue(_interleave4x4_kernel, false);
+ }
+
+ // Run matrix multiply
+ CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+
+ // Accumulate biases if provided
+ if(_accumulate_biases)
+ {
+ CLScheduler::get().enqueue(_accumulate_biases_kernel);
+ }
+}
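+
+// Usage sketch (illustrative comment only), for the common "convolution ->
+// fully connected, no batches" case with weights that still need reshaping:
+//
+//   CLFullyConnectedLayer fc;
+//   fc.configure(&conv_out, &weights, &biases, &out,
+//                true /* transpose_weights */, false /* are_weights_reshaped */);
+//   fc.run(); // the first run() reshapes the weights; later runs skip that step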
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
new file mode 100644
index 0000000000..7408054127
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+CLGEMM::CLGEMM()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+{
+}
+
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::F16);
+
+ if(c != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+ // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
+ if(a->info()->dimension(1) != 1)
+ {
+ _run_vector_matrix_multiplication = false;
+
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+ if(DataType::F32 == a->info()->data_type())
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 4);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
+ }
+ else if(DataType::F16 == a->info()->data_type())
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 8);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("DataType not supported");
+ }
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_b.allocator()->init(info_b);
+
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, output, alpha);
+
+ // Allocate intermediate tensors
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+ else // The first input tensor is a vector
+ {
+ _run_vector_matrix_multiplication = true;
+
+ // Configure the matrix multiply kernel
+ _mm_kernel.configure(a, b, output, alpha);
+ }
+
+ // Configure matrix addition kernel
+ if(beta != 0 && c != nullptr)
+ {
+ _ma_kernel.configure(c, output, beta);
+ _run_addition = true;
+ }
+}
+
+void CLGEMM::run()
+{
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ CLScheduler::get().enqueue(_interleave_kernel, false);
+
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+ }
+
+ // Run matrix multiply kernel
+ CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ CLScheduler::get().enqueue(_ma_kernel);
+ }
+}
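+
+// Usage sketch (illustrative comment only): computes output = alpha * A * B + beta * C
+// for tensors that are already initialised and allocated:
+//
+//   CLGEMM gemm;
+//   gemm.configure(&a, &b, &c, &output, 1.0f /* alpha */, 1.0f /* beta */);
+//   gemm.run(); // the reshape kernels are skipped when A is a vector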
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
new file mode 100644
index 0000000000..9dc77156ef
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
+
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
new file mode 100644
index 0000000000..45e011d8ce
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowp.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLGEMMLowp::CLGEMMLowp()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+{
+}
+
+void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
+ // Create shape for interleaved temporary tensor
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+
+ // Create shape for transposed temporary tensor
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(static_cast<float>(b->info()->dimension(0)) / 16));
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_b.allocator()->init(info_b);
+
+ // Configure kernels
+ _interleave_kernel.configure(a, &_tmp_a);
+ _transpose_kernel.configure(b, &_tmp_b);
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
+
+ // Allocate intermediate buffers
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+}
+
+void CLGEMMLowp::run()
+{
+ /* Run interleave kernel */
+ CLScheduler::get().enqueue(_interleave_kernel, false);
+
+ /* Run transpose kernel */
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+
+ /* Run matrix multiply kernel */
+ CLScheduler::get().enqueue(_mm_kernel, false);
+}
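+
+// Usage sketch (illustrative comment only); the offset/multiplier/shift values
+// below are placeholders that depend on the chosen quantisation:
+//
+//   CLGEMMLowp gemm_lowp;
+//   gemm_lowp.configure(&a_u8, &b_u8, &out_u8,
+//                       0 /* a_offset */, 0 /* b_offset */,
+//                       0 /* output_offset */, 1 /* output_mult_int */, 8 /* shift */);
+//   gemm_lowp.run();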
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
new file mode 100644
index 0000000000..362a3fe920
--- /dev/null
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLGaussian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLGaussian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
new file mode 100644
index 0000000000..e83a8fb857
--- /dev/null
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+CLGaussian5x5::CLGaussian5x5()
+ : _kernel_hor(), _kernel_vert(), _border_handler(), _tmp()
+{
+}
+
+void CLGaussian5x5::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, DataType::U16));
+
+ _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+
+ // Allocate intermediate buffers
+ _tmp.allocator()->allocate();
+}
+
+void CLGaussian5x5::run()
+{
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_kernel_hor, false);
+ CLScheduler::get().enqueue(_kernel_vert);
+}
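+
+// Usage sketch (illustrative comment only). The 5x5 Gaussian is separable: the
+// horizontal pass writes to a U16 intermediate, the vertical pass produces the
+// final U8 output:
+//
+//   CLGaussian5x5 gauss;
+//   gauss.configure(&src, &dst, BorderMode::REPLICATE, 0);
+//   gauss.run();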
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
new file mode 100644
index 0000000000..8a4279e99b
--- /dev/null
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLGaussianPyramid::CLGaussianPyramid()
+ : _input(nullptr), _pyramid(nullptr), _tmp()
+{
+}
+
+CLGaussianPyramidHalf::CLGaussianPyramidHalf()
+ : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+{
+}
+
+void CLGaussianPyramidHalf::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(pyramid == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _border_handler = arm_compute::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+
+ // Apply half scale to the X dimension of the tensor shape
+ TensorShape tensor_shape = pyramid->info()->tensor_shape();
+ tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
+
+ _tmp.init(pyramid_info);
+
+ for(size_t i = 0; i < num_levels - 1; ++i)
+ {
+ /* Configure horizontal kernel */
+ _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure vertical kernel */
+ _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure border */
+ _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ }
+ _tmp.allocate();
+ }
+}
+
+void CLGaussianPyramidHalf::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
+ _input->map(CLScheduler::get().queue(), true /* blocking */);
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+ _input->unmap(CLScheduler::get().queue());
+ _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ CLScheduler::get().enqueue(_border_handler[i], false);
+ CLScheduler::get().enqueue(_horizontal_reduction[i], false);
+ CLScheduler::get().enqueue(_vertical_reduction[i], false);
+ }
+}
+
+CLGaussianPyramidOrb::CLGaussianPyramidOrb()
+ : _gauss5x5(), _scale_nearest()
+{
+}
+
+void CLGaussianPyramidOrb::configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _gauss5x5 = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::cpp14::make_unique<CLScaleKernel[]>(num_levels - 1);
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
+
+ _tmp.init(pyramid_info);
+
+ for(size_t i = 0; i < num_levels - 1; ++i)
+ {
+ /* Configure gaussian 5x5 */
+ _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+
+ /* Configure scale image kernel */
+ _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode == BorderMode::UNDEFINED);
+ }
+
+ _tmp.allocate();
+ }
+}
+
+void CLGaussianPyramidOrb::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
+ _input->map(CLScheduler::get().queue(), true /* blocking */);
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+ _input->unmap(CLScheduler::get().queue());
+ _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ _gauss5x5[i].run();
+ CLScheduler::get().enqueue(_scale_nearest[i]);
+ }
+}
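+
+// Usage sketch (illustrative comment only), for the half-scale variant; the ORB
+// variant is configured the same way with SCALE_PYRAMID_ORB in the PyramidInfo:
+//
+//   CLPyramid pyramid;
+//   pyramid.init(PyramidInfo(4, SCALE_PYRAMID_HALF, src.info()->tensor_shape(), Format::U8));
+//   pyramid.allocate();
+//   CLGaussianPyramidHalf gauss_pyr;
+//   gauss_pyr.configure(&src, &pyramid, BorderMode::REPLICATE, 0);
+//   gauss_pyr.run(); // level 0 is a copy of the input, each level above is half-sized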
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
new file mode 100644
index 0000000000..b1b5a03ac1
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGDescriptor::CLHOGDescriptor()
+ : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+{
+}
+
+void CLHOGDescriptor::configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == hog);
+
+ const HOGInfo *hog_info = hog->info();
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const size_t num_bins = hog_info->num_bins();
+
+ Size2D cell_size = hog_info->cell_size();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell_size.width;
+ const size_t num_cells_y = height / cell_size.height;
+
+ // TensorShape of the input image
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // TensorShape of the hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Initialize tensors for magnitude, phase and HOG space
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space.allocator()->init(info_space);
+
+ // Initialize gradient kernel
+ _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+
+ // Initialize orientation binning kernel
+ _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+
+ // Initialize HOG norm kernel
+ _block_norm.configure(&_hog_space, output, hog->info());
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _hog_space.allocator()->allocate();
+}
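+
+/* Worked example (illustrative): for a 128x128 U8 input with 8x8 cells and 9 bins,
+ * the intermediate tensors configured above come out as:
+ *
+ *   _mag:       128x128, S16 (gradient magnitude)
+ *   _phase:     128x128, U8  (gradient phase)
+ *   _hog_space: 16x16, 9 channels, F32 (128 / 8 = 16 cells per dimension)
+ *
+ * The block normalization kernel then writes the final descriptor into "output".
+ */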
+
+void CLHOGDescriptor::run()
+{
+ // Run gradient
+ _gradient.run();
+
+ // Run orientation binning
+ CLScheduler::get().enqueue(_orient_bin, false);
+
+ // Run block normalization
+ CLScheduler::get().enqueue(_block_norm);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
new file mode 100644
index 0000000000..8eb5e4251f
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
+
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLHOGDetector::CLHOGDetector()
+ : _hog_detector_kernel(), _detection_windows(nullptr), _num_detection_windows()
+{
+}
+
+void CLHOGDetector::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+ _detection_windows = detection_windows;
+
+ // Allocate buffer for storing the number of detected objects
+ _num_detection_windows = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(unsigned int));
+
+ // Configure HOGDetectorKernel
+ _hog_detector_kernel.configure(input, hog, detection_windows, &_num_detection_windows, detection_window_stride, threshold, idx_class);
+}
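+
+/* Note on _num_detection_windows (a reading of the mechanism used in run() below):
+ * the kernel appends each match to "detection_windows" and bumps this device-side
+ * counter (presumably with an atomic increment). run() seeds the counter with the
+ * current number of stored windows, so new matches append after any existing ones,
+ * then reads the counter back with a blocking read and resizes the array to match.
+ */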
+
+void CLHOGDetector::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ // Reset number of detections
+ const unsigned int init_num_detection_windows = _detection_windows->num_values();
+ q.enqueueWriteBuffer(_num_detection_windows, CL_FALSE, 0, sizeof(unsigned int), &init_num_detection_windows);
+
+ // Run CLHOGDetectorKernel
+ CLScheduler::get().enqueue(_hog_detector_kernel);
+
+ // Read number of detections
+ unsigned int num_detection_windows = 0;
+ q.enqueueReadBuffer(_num_detection_windows, CL_TRUE, 0, sizeof(unsigned int), &num_detection_windows);
+
+ // Update the number of values stored in _detection_windows
+ _detection_windows->resize(static_cast<size_t>(num_detection_windows));
+
+ q.flush();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
new file mode 100644
index 0000000000..2387474358
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHOGGradient::CLHOGGradient()
+ : _derivative(), _mag_phase(), _gx(), _gy()
+{
+}
+
+void CLHOGGradient::configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
+
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // Allocate image memory
+ TensorInfo info(shape_img, Format::S16);
+ _gx.allocator()->init(info);
+ _gy.allocator()->init(info);
+
+ // Initialize derivative kernel
+ _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+
+ // Initialize magnitude/phase kernel
+ if(PhaseType::UNSIGNED == phase_type)
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::UNSIGNED);
+ }
+ else
+ {
+ _mag_phase.configure(&_gx, &_gy, output_magnitude, output_phase, MagnitudeType::L2NORM, PhaseType::SIGNED);
+ }
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+}
+
+void CLHOGGradient::run()
+{
+ // Run derivative
+ _derivative.run();
+
+ // Run magnitude/phase kernel
+ CLScheduler::get().enqueue(_mag_phase);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
new file mode 100644
index 0000000000..b8f2224ac8
--- /dev/null
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLArray.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+using namespace arm_compute;
+
+CLHOGMultiDetection::CLHOGMultiDetection()
+ : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
+ _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+{
+}
+
+void CLHOGMultiDetection::configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
+ ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
+ ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
+
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const TensorShape &shape_img = input->info()->tensor_shape();
+ const size_t num_models = multi_hog->num_models();
+ PhaseType phase_type = multi_hog->model(0)->info()->phase_type();
+
+ size_t prev_num_bins = multi_hog->model(0)->info()->num_bins();
+ Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size();
+ Size2D prev_block_size = multi_hog->model(0)->info()->block_size();
+ Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
+
+ /* Check if the CLHOGOrientationBinningKernel and CLHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
+ *
+ * 1) CLHOGOrientationBinningKernel is skipped if the cell size and the number of bins do not change.
+ * Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1".
+ * 2) CLHOGBlockNormalizationKernel is skipped if, in addition, the block size and the block stride do not change.
+ * Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1".
+ *
+ * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
+ * with "input_orient_bin", "input_hog_detect" and "input_block_norm".
+ */
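+ /* Example (hypothetical): for three sorted models, where model 1 only changes the
+ * block size/stride of model 0 and model 2 changes the cell size, the loop below
+ * produces:
+ *
+ * input_orient_bin = { 0, 2 } -> two orientation binning kernels
+ * input_block_norm = { (0,0), (1,0), (2,1) } -> three block normalization kernels
+ * input_hog_detect = { 0, 1, 2 } -> one detector per model
+ */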
+ std::vector<size_t> input_orient_bin;
+ std::vector<size_t> input_hog_detect;
+ std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+ input_orient_bin.push_back(0);
+ input_hog_detect.push_back(0);
+ input_block_norm.emplace_back(0, 0);
+
+ for(size_t i = 1; i < num_models; ++i)
+ {
+ size_t cur_num_bins = multi_hog->model(i)->info()->num_bins();
+ Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size();
+ Size2D cur_block_size = multi_hog->model(i)->info()->block_size();
+ Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
+
+ if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+ {
+ prev_num_bins = cur_num_bins;
+ prev_cell_size = cur_cell_size;
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute orientation binning and block normalization kernels. Update input to process
+ input_orient_bin.push_back(i);
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+ else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+ || (cur_block_stride.height != prev_block_stride.height))
+ {
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute block normalization kernel. Update input to process
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+
+ // Update input to process for hog detector kernel
+ input_hog_detect.push_back(input_block_norm.size() - 1);
+ }
+
+ _detection_windows = detection_windows;
+ _non_maxima_suppression = non_maxima_suppression;
+ _num_orient_bin_kernel = input_orient_bin.size(); // Number of CLHOGOrientationBinningKernel kernels to compute
+ _num_block_norm_kernel = input_block_norm.size(); // Number of CLHOGBlockNormalizationKernel kernels to compute
+ _num_hog_detect_kernel = input_hog_detect.size(); // Number of CLHOGDetector functions to compute
+
+ _orient_bin_kernel = arm_compute::cpp14::make_unique<CLHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::cpp14::make_unique<CLHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::cpp14::make_unique<CLHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::cpp14::make_unique<CLTensor[]>(_num_block_norm_kernel);
+
+ // Allocate tensors for magnitude and phase
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ // Initialize gradient kernel
+ _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+
+ // Configure CLTensor for the HOG space and orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_orient_bin[i];
+
+ // Get the corresponding cell size and number of bins
+ const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size();
+ const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell.width;
+ const size_t num_cells_y = height / cell.height;
+
+ // TensorShape of hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate HOG space
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space[i].allocator()->init(info_space);
+
+ // Initialize orientation binning kernel
+ _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ // Configure CLTensor for the normalized HOG space and block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_block_norm[i].first;
+ const size_t idx_orient_bin = input_block_norm[i].second;
+
+ // Allocate normalized HOG space
+ TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
+ _hog_norm_space[i].allocator()->init(tensor_info);
+
+ // Initialize block normalization kernel
+ _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ detection_window_strides->map(CLScheduler::get().queue(), true);
+
+ // Configure HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ const size_t idx_block_norm = input_hog_detect[i];
+
+ _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->cl_model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ }
+
+ detection_window_strides->unmap(CLScheduler::get().queue());
+
+ // Configure non-maxima suppression kernel
+ _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ _hog_norm_space[i].allocator()->allocate();
+ }
+}
+
+void CLHOGMultiDetection::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+
+ // Reset detection window
+ _detection_windows->clear();
+
+ // Run gradient
+ _gradient_kernel.run();
+
+ // Run orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_orient_bin_kernel.get() + i), false);
+ }
+
+ // Run block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ CLScheduler::get().enqueue(*(_block_norm_kernel.get() + i), false);
+ }
+
+ // Run HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ _hog_detect_kernel[i].run();
+ }
+
+ // Run non-maxima suppression kernel if enabled
+ if(_non_maxima_suppression)
+ {
+ // Map detection windows array before computing non-maxima suppression
+ _detection_windows->map(CLScheduler::get().queue(), true);
+ _non_maxima_kernel->run(_non_maxima_kernel->window());
+ _detection_windows->unmap(CLScheduler::get().queue());
+ }
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
new file mode 100644
index 0000000000..2db277fa4d
--- /dev/null
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
+#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <cmath>
+#include <utility>
+
+using namespace arm_compute;
+
+CLHarrisCorners::CLHarrisCorners()
+ : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0),
+ _corners(nullptr)
+{
+}
+
+void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist,
+ float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+ ARM_COMPUTE_ERROR_ON(nullptr == corners);
+
+ _corners = corners;
+
+ const TensorShape shape = input->info()->tensor_shape();
+ const DataType dt = (gradient_size < 7) ? DataType::S16 : DataType::S32;
+ TensorInfo tensor_info(shape, 1, dt);
+ _gx.allocator()->init(tensor_info);
+ _gy.allocator()->init(tensor_info);
+
+ TensorInfo info_f32(shape, 1, DataType::F32);
+ _score.allocator()->init(info_f32);
+ _nonmax.allocator()->init(info_f32);
+
+ _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+ /* Set/init the Sobel kernel according to gradient_size */
+ switch(gradient_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<CLSobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Gradient size not implemented");
+ }
+
+ // Configure border filling before harris score
+ _border_gx.configure(&_gx, block_size / 2, border_mode, constant_border_value);
+ _border_gy.configure(&_gy, block_size / 2, border_mode, constant_border_value);
+
+ // Normalization factor
+ const float norm_factor = 1.0f / (255.0f * pow(4.0f, gradient_size / 2) * block_size);
+ const float pow4_normalization_factor = pow(norm_factor, 4);
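+ /* Example values (illustrative): with gradient_size = 3 and block_size = 3 this is
+ * norm_factor = 1 / (255 * 4^(3 / 2) * 3) = 1 / (255 * 4 * 3) (integer division in
+ * the exponent). The 4th power is what the score kernel consumes, presumably
+ * because the Harris score is fourth-order in the input gradients.
+ */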
+
+ // Set/init the Harris score kernel according to block_size
+ _harris_score.configure(&_gx, &_gy, &_score, block_size, pow4_normalization_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+
+ // Init non-maxima suppression function
+ _non_max_suppr.configure(&_score, &_nonmax, border_mode == BorderMode::UNDEFINED);
+
+ // Init corner candidates kernel
+ _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+ // Init euclidean distance
+ _sort_euclidean.configure(_corners_list.get(), _corners, &_num_corner_candidates, min_dist);
+
+ // Allocate intermediate buffers
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _score.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void CLHarrisCorners::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+
+ // Reset the number of corner candidates to 0
+ _num_corner_candidates = 0;
+
+ // Run Sobel kernel
+ _sobel->run();
+
+ // Fill border before harris score kernel
+ CLScheduler::get().enqueue(_border_gx, false);
+ CLScheduler::get().enqueue(_border_gy, false);
+
+ // Run harris score kernel
+ CLScheduler::get().enqueue(_harris_score, false);
+
+ // Run non-maxima suppression
+ CLScheduler::get().enqueue(_non_max_suppr);
+
+ // Run corner candidate kernel
+ _nonmax.map(true);
+ Scheduler::get().schedule(&_candidates, Window::DimY);
+ _nonmax.unmap();
+
+ _corners->map(CLScheduler::get().queue(), true);
+ _sort_euclidean.run(_sort_euclidean.window());
+ _corners->unmap(CLScheduler::get().queue());
+}
diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp
new file mode 100644
index 0000000000..eb543387f5
--- /dev/null
+++ b/src/runtime/CL/functions/CLHistogram.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLHistogram.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLHistogram::CLHistogram()
+ : _kernel(), _kernel_border()
+{
+}
+
+void CLHistogram::configure(const ICLImage *input, ICLDistribution1D *output)
+{
+ _kernel.configure(input, output);
+ _kernel_border.configure(input, output);
+}
+
+void CLHistogram::run()
+{
+ CLScheduler::get().enqueue(_kernel, false);
+ CLScheduler::get().enqueue(_kernel_border);
+}
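+
+/* Note on the two enqueues above (a reading of the split): the main histogram
+ * kernel covers the part of each row that fits its vector width, while the border
+ * kernel accumulates the leftover pixels; the first enqueue skips the queue flush
+ * and leaves it to the second.
+ */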
diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
new file mode 100644
index 0000000000..2d54be32fa
--- /dev/null
+++ b/src/runtime/CL/functions/CLIntegralImage.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
+
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLIntegralImage::CLIntegralImage()
+ : _integral_hor(), _integral_vert()
+{
+}
+
+void CLIntegralImage::configure(const ICLTensor *input, ICLTensor *output)
+{
+ _integral_hor.configure(input, output);
+ _integral_vert.configure(output);
+}
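+
+/* The two passes configured above compute, in effect, the standard integral image
+ * recurrence:
+ *
+ *   hor(x, y) = hor(x - 1, y) + in(x, y)
+ *   out(x, y) = out(x, y - 1) + hor(x, y)
+ *
+ * i.e. a horizontal prefix sum per row followed by a vertical prefix sum per
+ * column, the latter done in place on the output tensor.
+ */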
+
+void CLIntegralImage::run()
+{
+ CLScheduler::get().enqueue(_integral_hor, false);
+ CLScheduler::get().enqueue(_integral_vert);
+}
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
new file mode 100644
index 0000000000..d7ce20653d
--- /dev/null
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
+#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
+
+using namespace arm_compute;
+
+CLLaplacianPyramid::CLLaplacianPyramid()
+ : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _depth_function(), _gauss_pyr(), _conv_pyr()
+{
+}
+
+void CLLaplacianPyramid::configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ _num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the Gaussian pyramid and the convolved pyramid
+ PyramidInfo pyramid_info;
+ pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
+
+ _gauss_pyr.init(pyramid_info);
+ _conv_pyr.init(pyramid_info);
+
+ // Create Gaussian Pyramid function
+ _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
+
+ _convf = arm_compute::cpp14::make_unique<CLGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::cpp14::make_unique<CLArithmeticSubtraction[]>(_num_levels);
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
+ _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
+ }
+
+ _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+
+ _gauss_pyr.allocate();
+ _conv_pyr.allocate();
+}
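+
+/* In pyramid terms, the configuration above computes, for each level i:
+ *
+ *   conv(i) = Gauss5x5(gauss(i))
+ *   L(i)    = gauss(i) - conv(i)
+ *
+ * where gauss(i) is the Gaussian pyramid level and L(i) the Laplacian level written
+ * into "pyramid"; the last convolved level is depth-converted into "output".
+ */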
+
+void CLLaplacianPyramid::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
+
+ _gaussian_pyr_function.run(); // Compute the Gaussian pyramid
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _convf[i].run(); // Convolve the Gaussian pyramid
+ }
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _subf[i].run(); // Compute the Laplacian image
+ }
+
+ _depth_function.run();
+}
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
new file mode 100644
index 0000000000..1dfab740d7
--- /dev/null
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+CLLaplacianReconstruct::CLLaplacianReconstruct()
+ : _tmp_pyr(), _addf(), _scalef(), _depthf()
+{
+}
+
+void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, const ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the tmp pyramid: I(n-2) = upsample( input + Laplace(n-1) )
+ PyramidInfo pyramid_info;
+ pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
+ _tmp_pyr.init(pyramid_info);
+
+ // Allocate add and scale functions. Level 0 does not need to be scaled.
+ _addf = arm_compute::cpp14::make_unique<CLArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::cpp14::make_unique<CLScale[]>(num_levels - 1);
+
+ const size_t last_level = num_levels - 1;
+
+ _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
+
+ // Scale levels n-1 to 1, and add levels n-2 to 0
+ for(size_t l = 0; l < last_level; ++l)
+ {
+ _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
+ _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
+ }
+
+ // Convert level 0 from S16 to U8
+ _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
+
+ _tmp_pyr.allocate();
+}
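+
+/* The reconstruction configured above therefore unrolls to:
+ *
+ *   tmp(n - 1) = input + L(n - 1)
+ *   tmp(l)     = upsample(tmp(l + 1)) + L(l)   for l = n - 2 ... 0
+ *   output     = saturate_cast<U8>(tmp(0))
+ *
+ * where L(l) are the Laplacian levels stored in "pyramid" and n its number of levels.
+ */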
+
+void CLLaplacianReconstruct::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+
+ const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
+
+ _addf[last_level].run();
+
+ // Run l = [last_level - 1, 0]
+ for(size_t l = last_level; l-- > 0;)
+ {
+ _scalef[l].run();
+ _addf[l].run();
+ }
+
+ _depthf.run();
+}
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
new file mode 100644
index 0000000000..263fb51987
--- /dev/null
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer()
+ : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+ }
+
+ bool _has_bias = (biases != nullptr);
+ _is_first_run = true;
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Create tensor to store the reshaped weights
+ const size_t mat_weights_cols = weights->info()->dimension(3);
+ const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->info()->dimension(4);
+
+ const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create locally connected layer output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ // Allocate intermediate tensors
+ _weights_reshaped.allocator()->allocate();
+ _input_im2col_reshaped.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+}
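+
+/* Shape example (illustrative): a 16x16x3 F32 input with 5x5x3 weights, 16 filters,
+ * stride 1, no padding and a bias gives conv_w = conv_h = 12, hence:
+ *
+ *   _weights_reshaped:      16 x (5*5*3 + 1) x 144
+ *   _input_im2col_reshaped: (5*5*3 + 1) x 144
+ *   _gemm_output:           16 x 144
+ *
+ * i.e. one independent matrix multiply per output position (dimension 4 of the
+ * weights), which is what distinguishes this layer from a regular convolution.
+ */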
+
+void CLLocallyConnectedLayer::run()
+{
+ // Run weights reshaping (runs only once per configure)
+ if(_is_first_run)
+ {
+ _is_first_run = false;
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ }
+
+ // Run input reshaping
+ CLScheduler::get().enqueue(_input_im2col_kernel);
+
+ // Run the vector-matrix multiply on the reshaped matrices
+ CLScheduler::get().enqueue(_mm_kernel);
+
+ // Reshape output matrix
+ CLScheduler::get().enqueue(_output_col2im_kernel, false);
+}
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
new file mode 100644
index 0000000000..51088cb71f
--- /dev/null
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
+{
+ auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ k->configure(input1, input2, output, nullptr, mag_type);
+ _kernel = std::move(k);
+}
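+
+/* Note: CLMagnitude follows the simple-function pattern used throughout this patch;
+ * the base class (ICLSimpleFunction) owns "_kernel" and its run() enqueues it, so no
+ * run() override is needed here. Passing nullptr as the phase output makes the
+ * shared magnitude/phase kernel compute the magnitude only.
+ */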
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
new file mode 100644
index 0000000000..56ba146790
--- /dev/null
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
+
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLMeanStdDev::CLMeanStdDev()
+ : _mean_stddev_kernel(),
+ _global_sum(),
+ _global_sum_squared()
+{
+}
+
+void CLMeanStdDev::configure(const ICLImage *input, float *mean, float *stddev)
+{
+ _global_sum = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+
+ if(stddev != nullptr)
+ {
+ _global_sum_squared = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_ulong));
+ }
+
+ _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+}
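+
+/* The kernel accumulates a global pixel sum (and a sum of squares when stddev is
+ * requested) into the cl_ulong buffers above; the host-visible results then follow
+ * the usual formulas for a W x H image:
+ *
+ *   mean   = sum / (W * H)
+ *   stddev = sqrt(sum_sq / (W * H) - mean^2)
+ */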
+
+void CLMeanStdDev::run()
+{
+ CLScheduler::get().enqueue(_mean_stddev_kernel);
+}
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
new file mode 100644
index 0000000000..0c10f9aa08
--- /dev/null
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLMedian3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLMedian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
new file mode 100644
index 0000000000..ad783d8a53
--- /dev/null
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+
+using namespace arm_compute;
+
+CLMinMaxLocation::CLMinMaxLocation()
+ : _min_max_kernel(),
+ _min_max_loc_kernel(),
+ _min_max_vals(),
+ _min_max_count_vals(),
+ _min(nullptr),
+ _max(nullptr),
+ _min_count(nullptr),
+ _max_count(nullptr),
+ _min_loc(nullptr),
+ _max_loc(nullptr)
+{
+}
+
+void CLMinMaxLocation::configure(const ICLImage *input, int32_t *min, int32_t *max, CLCoordinates2DArray *min_loc, CLCoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == min);
+ ARM_COMPUTE_ERROR_ON(nullptr == max);
+
+ _min_max_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(int32_t));
+ _min_max_count_vals = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 2 * sizeof(uint32_t));
+ _min = min;
+ _max = max;
+ _min_count = min_count;
+ _max_count = max_count;
+ _min_loc = min_loc;
+ _max_loc = max_loc;
+
+ _min_max_kernel.configure(input, &_min_max_vals);
+ _min_max_loc_kernel.configure(input, &_min_max_vals, &_min_max_count_vals, _min_loc, _max_loc);
+}
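+
+/* Buffer layout relied upon by run() below: _min_max_vals holds { min, max } as two
+ * int32 values and _min_max_count_vals holds { min_count, max_count } as two uint32
+ * values, which is why the reads below use offsets of 0 * sizeof(...) and
+ * 1 * sizeof(...).
+ */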
+
+void CLMinMaxLocation::run()
+{
+ cl::CommandQueue q = CLScheduler::get().queue();
+
+ CLScheduler::get().enqueue(_min_max_kernel, false);
+ CLScheduler::get().enqueue(_min_max_loc_kernel, false);
+
+ // Update min and max
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 0 * sizeof(int32_t), sizeof(int32_t), _min);
+ q.enqueueReadBuffer(_min_max_vals, CL_FALSE, 1 * sizeof(int32_t), sizeof(int32_t), _max);
+
+ // Update min and max count
+ if(_min_count != nullptr)
+ {
+ q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 0 * sizeof(uint32_t), sizeof(uint32_t), _min_count);
+ }
+ if(_max_count != nullptr)
+ {
+ q.enqueueReadBuffer(_min_max_count_vals, CL_FALSE, 1 * sizeof(uint32_t), sizeof(uint32_t), _max_count);
+ }
+
+ // Update the min/max location arrays (the blocking reads also force the kernels to complete)
+ if(_min_loc != nullptr)
+ {
+ unsigned int min_count = 0;
+ q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 0 * sizeof(uint32_t), sizeof(uint32_t), &min_count);
+ size_t min_corner_size = std::min(static_cast<size_t>(min_count), _min_loc->max_num_values());
+ _min_loc->resize(min_corner_size);
+ }
+ if(_max_loc != nullptr)
+ {
+ unsigned int max_count = 0;
+ q.enqueueReadBuffer(_min_max_count_vals, CL_TRUE, 1 * sizeof(uint32_t), sizeof(uint32_t), &max_count);
+ size_t max_corner_size = std::min(static_cast<size_t>(max_count), _max_loc->max_num_values());
+ _max_loc->resize(max_corner_size);
+ }
+}
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
new file mode 100644
index 0000000000..b593a6cced
--- /dev/null
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
+
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLNonLinearFilter::configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLNonLinearFilterKernel>();
+ k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
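
The wrapper above pairs the filter kernel with a border handler driven by border_mode. A hedged configuration sketch; src/dst setup is assumed, and the mask pointer is assumed to matter only for MatrixPattern::OTHER:

    // Sketch: 3x3 median over a box neighbourhood with replicated borders (assumed setup).
    CLNonLinearFilter filter;
    filter.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 3 /* mask_size */,
                     MatrixPattern::BOX, nullptr /* mask, assumed unused for BOX */,
                     BorderMode::REPLICATE);
    filter.run();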
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
new file mode 100644
index 0000000000..ca7d5aede7
--- /dev/null
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLNonMaximaSuppression3x3::configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode)
+{
+ auto k = arm_compute::cpp14::make_unique<CLNonMaximaSuppression3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+
+ if(border_mode != BorderMode::UNDEFINED)
+ {
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT);
+ }
+ else
+ {
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::UNDEFINED);
+ }
+}
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
new file mode 100644
index 0000000000..2d89ebd676
--- /dev/null
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLNormalizationLayer::CLNormalizationLayer()
+ : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+{
+}
+
+void CLNormalizationLayer::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+ _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+
+ _norm_kernel.configure(input, &_squared_input, output, norm_info);
+ _multiply_kernel.configure(input, input, &_squared_input, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+ // Fill the border with 3 elements since the IN_MAP normalization kernel uses vload4
+ _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ // Allocate intermediate buffers
+ _squared_input.allocator()->allocate();
+}
+
+void CLNormalizationLayer::run()
+{
+ CLScheduler::get().enqueue(_multiply_kernel, false);
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_norm_kernel, false);
+}
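
In the configuration above, the multiply kernel squares the input, the border handler zero-pads the squared tensor (three elements, for vload4), and the norm kernel combines input and input-squared according to norm_info, roughly output = input / (kappa + alpha * sum(input^2))^beta over the window selected by the normalisation type. A hedged usage sketch, assuming src/dst tensors and this NormalizationLayerInfo constructor shape:

    // Sketch: cross-map local response normalisation over 5 channels (assumed setup).
    CLNormalizationLayer norm;
    norm.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5 /* norm_size */));
    norm.run();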
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
new file mode 100644
index 0000000000..a6b0eb3bec
--- /dev/null
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/CL/CLPyramid.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+
+using namespace arm_compute;
+
+CLOpticalFlow::CLOpticalFlow()
+ : _tracker_init_kernel(), _tracker_stage0_kernel(), _tracker_stage1_kernel(), _tracker_finalize_kernel(), _func_scharr(), _scharr_gx(), _scharr_gy(), _old_points(nullptr),
+ _new_points_estimates(nullptr), _new_points(nullptr), _old_points_internal(), _new_points_internal(), _coefficient_table(), _old_values(), _num_levels(0)
+{
+}
+
+void CLOpticalFlow::configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
+ const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == old_points);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points);
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values());
+
+ // Set member variables
+ _old_points = old_points;
+ _new_points_estimates = new_points_estimates;
+ _new_points = new_points;
+ _num_levels = old_pyramid->info()->num_levels();
+
+ const float pyr_scale = old_pyramid->info()->scale();
+ const int list_length = old_points->num_values();
+ const int old_values_list_length = list_length * window_dimension * window_dimension;
+
+ // Create kernels and tensors
+ _tracker_init_kernel = arm_compute::cpp14::make_unique<CLLKTrackerInitKernel[]>(_num_levels);
+ _tracker_stage0_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage0Kernel[]>(_num_levels);
+ _tracker_stage1_kernel = arm_compute::cpp14::make_unique<CLLKTrackerStage1Kernel[]>(_num_levels);
+ _func_scharr = arm_compute::cpp14::make_unique<CLScharr3x3[]>(_num_levels);
+ _scharr_gx = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+ _scharr_gy = arm_compute::cpp14::make_unique<CLTensor[]>(_num_levels);
+
+ // Create internal keypoint arrays
+ _old_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _old_points_internal->resize(list_length);
+ _new_points_internal = arm_compute::cpp14::make_unique<CLLKInternalKeypointArray>(list_length);
+ _new_points_internal->resize(list_length);
+ _coefficient_table = arm_compute::cpp14::make_unique<CLCoefficientTableArray>(list_length);
+ _coefficient_table->resize(list_length);
+ _old_values = arm_compute::cpp14::make_unique<CLOldValueArray>(old_values_list_length);
+ _old_values->resize(old_values_list_length);
+ _new_points->resize(list_length);
+
+ for(size_t i = 0; i < _num_levels; ++i)
+ {
+ // Get images from the ith level of the old and new pyramids
+ ICLImage *old_ith_input = old_pyramid->get_pyramid_level(i);
+ ICLImage *new_ith_input = new_pyramid->get_pyramid_level(i);
+
+ // Get width and height of images
+ const unsigned int width_ith = old_ith_input->info()->dimension(0);
+ const unsigned int height_ith = new_ith_input->info()->dimension(1);
+
+ // Initialize Scharr tensors
+ TensorInfo tensor_info(TensorShape(width_ith, height_ith), 1, DataType::S16);
+ _scharr_gx[i].allocator()->init(tensor_info);
+ _scharr_gy[i].allocator()->init(tensor_info);
+
+ // Init Scharr kernel
+ _func_scharr[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i], border_mode, constant_border_value);
+
+ // Init Lucas-Kanade init kernel
+ _tracker_init_kernel[i].configure(old_points, new_points_estimates, _old_points_internal.get(), _new_points_internal.get(), use_initial_estimate, i, _num_levels, pyr_scale);
+
+ // Init Lucas-Kanade stage0 kernel
+ _tracker_stage0_kernel[i].configure(old_ith_input, &_scharr_gx[i], &_scharr_gy[i],
+ _old_points_internal.get(), _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
+ window_dimension, i);
+
+ // Init Lucas-Kanade stage1 kernel
+ _tracker_stage1_kernel[i].configure(new_ith_input, _new_points_internal.get(), _coefficient_table.get(), _old_values.get(),
+ termination, epsilon, num_iterations, window_dimension, i);
+
+ // Allocate intermediate buffers
+ _scharr_gx[i].allocator()->allocate();
+ _scharr_gy[i].allocator()->allocate();
+ }
+
+ // Finalize Lucas-Kanade
+ _tracker_finalize_kernel.configure(_new_points_internal.get(), new_points);
+}
+
+void CLOpticalFlow::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+
+ for(unsigned int level = _num_levels; level > 0; --level)
+ {
+ // Run Scharr kernel
+ _func_scharr[level - 1].run();
+
+ // Run Lucas-Kanade init kernel
+ CLScheduler::get().enqueue(_tracker_init_kernel[level - 1]);
+
+ // Run Lucas-Kanade stage0 kernel
+ CLScheduler::get().enqueue(_tracker_stage0_kernel[level - 1]);
+
+ // Run Lucas-Kanade stage1 kernel
+ CLScheduler::get().enqueue(_tracker_stage1_kernel[level - 1]);
+ }
+
+ CLScheduler::get().enqueue(_tracker_finalize_kernel, true);
+}
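
run() walks the pyramid coarse-to-fine (level _num_levels down to 1), refining the keypoint estimates with Scharr gradients plus the three Lucas-Kanade stages, and blocks only on the finalize kernel. A hedged usage sketch; pyramid construction and keypoint array setup are assumed, and the Termination enumerator name is an assumption:

    // Sketch: track keypoints from the old frame to the new one (pyramids assumed built).
    CLOpticalFlow optical_flow;
    optical_flow.configure(&old_pyramid, &new_pyramid,
                           &old_points, &new_points_estimates, &new_points,
                           Termination::TERM_CRITERIA_BOTH, 0.01f /* epsilon */,
                           5 /* num_iterations */, 21 /* window_dimension */,
                           true /* use_initial_estimate */,
                           BorderMode::UNDEFINED, 0 /* constant_border_value */);
    optical_flow.run();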
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
new file mode 100644
index 0000000000..a8cb22b06e
--- /dev/null
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPhase.h"
+
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLPhase::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type)
+{
+ auto k = arm_compute::cpp14::make_unique<CLMagnitudePhaseKernel>();
+ k->configure(input1, input2, nullptr, output, MagnitudeType::L1NORM, phase_type);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
new file mode 100644
index 0000000000..8a86c2e203
--- /dev/null
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ auto k = arm_compute::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+}
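
The kernel computes output = input1 * input2 * scale under the given overflow and rounding policies. A hedged sketch, assuming src/dst setup; a scale of 1/255 keeps the product of two normalised U8 images in range:

    // Sketch: elementwise product of two U8 images, rescaled by 1/255 (assumed setup).
    CLPixelWiseMultiplication mul;
    mul.configure(&a, &b, &out, 1.0f / 255.0f,
                  ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
    mul.run();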
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
new file mode 100644
index 0000000000..1ef70f4a2b
--- /dev/null
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+ // Configure pooling kernel
+ auto k = arm_compute::cpp14::make_unique<CLPoolingLayerKernel>();
+ k->configure(input, output, pool_info);
+ _kernel = std::move(k);
+
+ // Configure border depending on operation required
+ BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+}
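
The border choice above is deliberate: REPLICATE for max pooling so padded samples can never beat interior values, CONSTANT zero for average pooling. A hedged usage sketch, assuming PoolingLayerInfo takes (type, pool size, stride info) and PadStrideInfo takes (stride_x, stride_y, pad_x, pad_y):

    // Sketch: 2x2 max pooling with stride 2 and no padding (assumed setup).
    CLPoolingLayer pool;
    pool.configure(&src, &dst,
                   PoolingLayerInfo(PoolingType::MAX, 2 /* pool_size */, PadStrideInfo(2, 2, 0, 0)));
    pool.run();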
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
new file mode 100644
index 0000000000..f6b1713c58
--- /dev/null
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRemap.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLRemap::configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
+
+ auto k = arm_compute::cpp14::make_unique<CLRemapKernel>();
+ k->configure(input, map_x, map_y, output, policy, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
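
map_x and map_y are F32 images giving, for every output pixel, the source coordinates to sample, which is why AREA interpolation is rejected above. A hedged identity-remap sketch; filling the maps is assumed done on the host:

    // Sketch: identity remap, i.e. map_x(x, y) = x and map_y(x, y) = y (assumed setup).
    CLRemap remap;
    remap.configure(&src, &map_x, &map_y, &dst,
                    InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT, 0);
    remap.run();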
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
new file mode 100644
index 0000000000..043f873028
--- /dev/null
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLScale.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(output == input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ auto k = arm_compute::cpp14::make_unique<CLScaleKernel>();
+ k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+}
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
new file mode 100644
index 0000000000..c8bc465be6
--- /dev/null
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLScharr3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLScharr3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
new file mode 100644
index 0000000000..6b74ebaedb
--- /dev/null
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
+
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLSobel3x3::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLSobel3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
new file mode 100644
index 0000000000..098b546c1a
--- /dev/null
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+CLSobel5x5::CLSobel5x5()
+ : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+{
+}
+
+void CLSobel5x5::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S16);
+
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void CLSobel5x5::run()
+{
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_sobel_hor, false);
+ CLScheduler::get().enqueue(_sobel_vert);
+}
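
The 5x5 Sobel is computed separably: a horizontal pass into S16 temporaries followed by a vertical pass, which is cheaper than one dense 5x5 convolution; run() enqueues the border fill and both passes, blocking only on the last. A hedged usage sketch; tensor initialisation and allocation are assumed:

    // Sketch: both gradients of a U8 image with replicated borders (assumed setup).
    CLTensor gx, gy; // S16 outputs, assumed initialised to the input shape and allocated
    CLSobel5x5 sobel;
    sobel.configure(&src, &gx, &gy, BorderMode::REPLICATE);
    sobel.run();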
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
new file mode 100644
index 0000000000..db84fa99ae
--- /dev/null
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+CLSobel7x7::CLSobel7x7()
+ : _sobel_hor(), _sobel_vert(), _border_handler(), _tmp_x(), _tmp_y()
+{
+}
+
+void CLSobel7x7::configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, DataType::S32);
+
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void CLSobel7x7::run()
+{
+ CLScheduler::get().enqueue(_border_handler, false);
+ CLScheduler::get().enqueue(_sobel_hor, false);
+ CLScheduler::get().enqueue(_sobel_vert);
+}
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
new file mode 100644
index 0000000000..2a78c58053
--- /dev/null
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLSoftmaxLayer::CLSoftmaxLayer()
+ : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+{
+}
+
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+
+ // Create intermediate tensors shapes
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
+
+ TensorShape shape = input->info()->tensor_shape();
+ shape.set(0, 1);
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
+ _max.allocator()->init(tensor_info_max_sum);
+ _sum.allocator()->init(tensor_info_max_sum);
+
+ // Configure Kernels
+ _max_kernel.configure(input, &_max);
+ _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+ _norm_kernel.configure(&_tmp, &_sum, output);
+
+ // Allocate intermediate buffers
+ _tmp.allocator()->allocate();
+ _max.allocator()->allocate();
+ _sum.allocator()->allocate();
+}
+
+void CLSoftmaxLayer::run()
+{
+ CLScheduler::get().enqueue(_max_kernel, false);
+ CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+ CLScheduler::get().enqueue(_norm_kernel);
+}
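
The three kernels implement the numerically stable softmax: first the row maximum m, then t_i = exp(x_i - m) together with the running sum, then the normalisation t_i / sum(t_j); subtracting m keeps exp() from overflowing. A scalar reference of the same three stages, for illustration only:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Reference softmax mirroring the max / shift-exp-sum / norm kernel split (x non-empty).
    std::vector<float> softmax_ref(const std::vector<float> &x)
    {
        const float m = *std::max_element(x.begin(), x.end()); // stage 1: max
        std::vector<float> y(x.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i) // stage 2: shift, exponentiate, accumulate
        {
            y[i] = std::exp(x[i] - m);
            sum += y[i];
        }
        for(float &v : y) // stage 3: normalise
        {
            v /= sum;
        }
        return y;
    }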
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
new file mode 100644
index 0000000000..743ed5e73e
--- /dev/null
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
+
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLTableLookup::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLTableLookupKernel>();
+ k->configure(input, lut, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
new file mode 100644
index 0000000000..e70f932d66
--- /dev/null
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLThreshold.h"
+
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ auto k = arm_compute::cpp14::make_unique<CLThresholdKernel>();
+ k->configure(input, output, threshold, false_value, true_value, type, upper);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
new file mode 100644
index 0000000000..d802b4fe77
--- /dev/null
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLTranspose::configure(const ICLTensor *input, ICLTensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<CLTransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
new file mode 100644
index 0000000000..537e0d9397
--- /dev/null
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
+
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLWarpAffine::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLWarpAffineKernel>();
+ k->configure(input, output, matrix, policy);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
new file mode 100644
index 0000000000..a552ab480d
--- /dev/null
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
+
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLWarpPerspective::configure(ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<CLWarpPerspectiveKernel>();
+ k->configure(input, output, matrix, policy);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
new file mode 100644
index 0000000000..886933074d
--- /dev/null
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include <iostream>
+#include <semaphore.h>
+#include <system_error>
+#include <thread>
+
+using namespace arm_compute;
+
+class arm_compute::Thread
+{
+public:
+ /** Start a new thread
+ */
+ Thread();
+ Thread(const Thread &) = delete;
+ Thread &operator=(const Thread &) = delete;
+ Thread(Thread &&) = delete;
+ Thread &operator=(Thread &&) = delete;
+ /** Make the thread join
+ */
+ ~Thread();
+ /** Request the worker thread to start executing the given kernel
+ * This function will return as soon as the kernel has been sent to the worker thread.
+ * wait() needs to be called to ensure the execution is complete.
+ */
+ void start(ICPPKernel *kernel, const Window &window);
+ /** Wait for the current kernel execution to complete
+ */
+ void wait();
+ /** Function run by the worker thread
+ */
+ void worker_thread();
+
+private:
+ std::thread _thread;
+ ICPPKernel *_kernel{ nullptr };
+ Window _window;
+ sem_t _wait_for_work;
+ sem_t _job_complete;
+ std::exception_ptr _current_exception;
+};
+
+Thread::Thread()
+ : _thread(), _window(), _wait_for_work(), _job_complete(), _current_exception(nullptr)
+{
+ int ret = sem_init(&_wait_for_work, 0, 0);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+
+ ret = sem_init(&_job_complete, 0, 0);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+
+ _thread = std::thread(&Thread::worker_thread, this);
+}
+
+Thread::~Thread()
+{
+ ARM_COMPUTE_ERROR_ON(!_thread.joinable());
+
+ start(nullptr, Window());
+ _thread.join();
+
+ int ret = sem_destroy(&_wait_for_work);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+
+ ret = sem_destroy(&_job_complete);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ ARM_COMPUTE_UNUSED(ret);
+}
+
+void Thread::start(ICPPKernel *kernel, const Window &window)
+{
+ _kernel = kernel;
+ _window = window;
+ int ret = sem_post(&_wait_for_work);
+ ARM_COMPUTE_UNUSED(ret);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+}
+
+void Thread::wait()
+{
+ int ret = sem_wait(&_job_complete);
+ ARM_COMPUTE_UNUSED(ret);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ if(_current_exception)
+ {
+ std::rethrow_exception(_current_exception);
+ }
+}
+
+void Thread::worker_thread()
+{
+ while(sem_wait(&_wait_for_work) >= 0)
+ {
+ _current_exception = nullptr;
+ // Time to exit
+ if(_kernel == nullptr)
+ {
+ return;
+ }
+
+ try
+ {
+ _window.validate();
+ _kernel->run(_window);
+ }
+ catch(...)
+ {
+ _current_exception = std::current_exception();
+ }
+ int ret = sem_post(&_job_complete);
+ ARM_COMPUTE_UNUSED(ret);
+ ARM_COMPUTE_ERROR_ON(ret < 0);
+ }
+
+ ARM_COMPUTE_ERROR("Wait failed");
+}
+
+namespace
+{
+void delete_threads(Thread *t)
+{
+ delete[] t;
+}
+} // namespace
+
+CPPScheduler &CPPScheduler::get()
+{
+ static CPPScheduler scheduler;
+ return scheduler;
+}
+
+unsigned int CPPScheduler::num_threads() const
+{
+ return _num_threads;
+}
+
+CPPScheduler::CPPScheduler()
+ : _num_threads(std::thread::hardware_concurrency()),
+ _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
+{
+}
+
+void CPPScheduler::set_num_threads(unsigned int num_threads)
+{
+ const unsigned int num_cores = std::thread::hardware_concurrency();
+ _num_threads = num_threads == 0 ? num_cores : num_threads;
+}
+
+void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+
+ /** [Scheduler example] */
+ const Window &max_window = kernel->window();
+ const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+ const unsigned int num_threads = std::min(num_iterations, _num_threads);
+
+ if(!kernel->is_parallelisable() || 1 == num_threads)
+ {
+ kernel->run(max_window);
+ }
+ else
+ {
+ for(unsigned int t = 0; t < num_threads; ++t)
+ {
+ Window win = max_window.split_window(split_dimension, t, num_threads);
+ win.set_thread_id(t);
+ win.set_num_threads(num_threads);
+
+ if(t != num_threads - 1)
+ {
+ _threads[t].start(kernel, win);
+ }
+ else
+ {
+ kernel->run(win);
+ }
+ }
+
+ try
+ {
+ for(unsigned int t = 1; t < num_threads; ++t)
+ {
+ _threads[t - 1].wait();
+ }
+ }
+ catch(const std::system_error &e)
+ {
+ std::cout << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
+ }
+ }
+ /** [Scheduler example] */
+}
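
schedule() splits the kernel's maximal window into num_threads slices along split_dimension, hands all but the last slice to the semaphore-driven worker threads, runs the final slice on the calling thread, then joins via wait(). A hedged usage sketch; the kernel is assumed configured and parallelisable:

    // Sketch: run a kernel split along the Y dimension (assumed kernel setup).
    CPPScheduler::get().set_num_threads(4); // 0 would fall back to one thread per core
    CPPScheduler::get().schedule(&my_kernel, Window::DimY);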
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
new file mode 100644
index 0000000000..f086813e91
--- /dev/null
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+SingleThreadScheduler &SingleThreadScheduler::get()
+{
+ static SingleThreadScheduler scheduler;
+ return scheduler;
+}
+
+void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
+{
+ ARM_COMPUTE_UNUSED(num_threads);
+}
+
+void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_UNUSED(split_dimension);
+ kernel->run(kernel->window());
+}
+
+unsigned int SingleThreadScheduler::num_threads() const
+{
+ return 1;
+}
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
new file mode 100644
index 0000000000..b06767499b
--- /dev/null
+++ b/src/runtime/Distribution1D.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Distribution1D.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+Distribution1D::Distribution1D(size_t num_bins, int32_t offset, uint32_t range)
+ : IDistribution1D(num_bins, offset, range), _data(arm_compute::cpp14::make_unique<uint32_t[]>(num_bins))
+{
+}
+
+uint32_t *Distribution1D::buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == _data);
+ return _data.get();
+}
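
Distribution1D simply owns a CPU-side array of num_bins uint32_t counters on top of the IDistribution1D metadata. A usage sketch with illustrative bin parameters, e.g. as the output container of a histogram:

    #include "arm_compute/runtime/Distribution1D.h"
    #include <cstdint>

    int main()
    {
        // 256 bins covering the value range [0, 256) starting at offset 0 (illustrative values)
        arm_compute::Distribution1D dist(256, 0, 256);
        uint32_t *bins = dist.buffer(); // storage was allocated in the constructor
        bins[0] = 42;                   // bins are plain uint32_t counters
        return 0;
    }
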
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
new file mode 100644
index 0000000000..5d533dded4
--- /dev/null
+++ b/src/runtime/HOG.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/HOG.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+HOG::HOG()
+ : IHOG(), _info(), _descriptor(nullptr)
+{
+}
+
+void HOG::init(const HOGInfo &input)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr != _descriptor);
+ _info = input;
+ _descriptor = arm_compute::cpp14::make_unique<float[]>(_info.descriptor_size());
+}
+
+float *HOG::descriptor() const
+{
+ return _descriptor.get();
+}
+
+const HOGInfo *HOG::info() const
+{
+ return &_info;
+}
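
HOG::init() snapshots the HOGInfo metadata and allocates descriptor_size() floats for the normalisation coefficients; the error macro guards against double initialisation. A sketch with illustrative geometry; the HOGInfo constructor arguments are assumed from the library headers, not taken from this patch:

    #include "arm_compute/core/HOGInfo.h"
    #include "arm_compute/runtime/HOG.h"

    int main()
    {
        using namespace arm_compute;
        // Assumed signature: cell size, block size, detection window size, block stride, bins.
        const HOGInfo info(Size2D(8, 8), Size2D(16, 16), Size2D(64, 128), Size2D(8, 8), 9);
        HOG hog;
        hog.init(info);                         // allocates info.descriptor_size() floats
        float *coefficients = hog.descriptor(); // nullptr until init() has run
        (void)coefficients;
        return 0;
    }
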
diff --git a/src/runtime/ILutAllocator.cpp b/src/runtime/ILutAllocator.cpp
new file mode 100644
index 0000000000..fb961638f1
--- /dev/null
+++ b/src/runtime/ILutAllocator.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/ILutAllocator.h"
+
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+ILutAllocator::ILutAllocator()
+ : _num_elements(0), _data_type(DataType::U8)
+{
+}
+
+void ILutAllocator::init(size_t num_elements, DataType data_type)
+{
+ // Init internal metadata
+ _num_elements = num_elements;
+ _data_type = data_type;
+
+ // Allocate the LUT's memory
+ allocate();
+}
+
+size_t ILutAllocator::num_elements() const
+{
+ return _num_elements;
+}
+
+DataType ILutAllocator::type() const
+{
+ return _data_type;
+}
+
+size_t ILutAllocator::size() const
+{
+ return data_size_from_type(_data_type) * num_elements();
+}
diff --git a/src/runtime/ITensorAllocator.cpp b/src/runtime/ITensorAllocator.cpp
new file mode 100644
index 0000000000..8294201384
--- /dev/null
+++ b/src/runtime/ITensorAllocator.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+ITensorAllocator::ITensorAllocator()
+ : _info()
+{
+}
+
+void ITensorAllocator::init(const TensorInfo &input)
+{
+ _info = input;
+}
+
+TensorInfo &ITensorAllocator::info()
+{
+ return _info;
+}
+
+const TensorInfo &ITensorAllocator::info() const
+{
+ return _info;
+}
diff --git a/src/runtime/Lut.cpp b/src/runtime/Lut.cpp
new file mode 100644
index 0000000000..1b3daf1f60
--- /dev/null
+++ b/src/runtime/Lut.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Lut.h"
+
+#include <cstring>
+
+using namespace arm_compute;
+
+Lut::Lut()
+ : _allocator()
+{
+}
+
+Lut::Lut(size_t num_elements, DataType data_type)
+ : _allocator()
+{
+ _allocator.init(num_elements, data_type);
+}
+
+size_t Lut::num_elements() const
+{
+ return _allocator.num_elements();
+}
+
+uint32_t Lut::index_offset() const
+{
+ return (DataType::S16 == _allocator.type()) ? num_elements() / 2 : 0;
+}
+
+size_t Lut::size_in_bytes() const
+{
+ return _allocator.size();
+}
+
+DataType Lut::type() const
+{
+ return _allocator.type();
+}
+
+uint8_t *Lut::buffer() const
+{
+ return _allocator.data();
+}
+
+void Lut::clear()
+{
+ ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr);
+ std::memset(this->buffer(), 0, this->size_in_bytes());
+}
+
+ILutAllocator *Lut::allocator()
+{
+ return &_allocator;
+}
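
Note index_offset(): for a signed S16 table the entry for input value 0 sits in the middle of the buffer, so lookups bias the index by num_elements() / 2. A usage sketch with an illustrative table size:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Lut.h"
    #include <cstdint>

    int main()
    {
        using namespace arm_compute;
        Lut lut(65536, DataType::S16);      // the constructor init()s the allocator, which allocates
        lut.clear();                        // zero-fill the table
        auto *table = reinterpret_cast<int16_t *>(lut.buffer());
        table[lut.index_offset() + 0] = 0;  // entry for input value 0
        table[lut.index_offset() - 1] = -1; // entry for input value -1
        return 0;
    }
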
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
new file mode 100644
index 0000000000..17baf21f45
--- /dev/null
+++ b/src/runtime/LutAllocator.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/LutAllocator.h"
+
+#include "arm_compute/core/Helpers.h"
+
+using namespace arm_compute;
+
+LutAllocator::LutAllocator()
+ : _buffer(nullptr)
+{
+}
+
+uint8_t *LutAllocator::data() const
+{
+ return _buffer.get();
+}
+
+void LutAllocator::allocate()
+{
+ _buffer = arm_compute::cpp14::make_unique<uint8_t[]>(size());
+}
+
+uint8_t *LutAllocator::lock()
+{
+ return _buffer.get();
+}
+
+void LutAllocator::unlock()
+{
+}
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
new file mode 100644
index 0000000000..003dc93895
--- /dev/null
+++ b/src/runtime/MultiHOG.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MultiHOG.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiHOG.h"
+
+using namespace arm_compute;
+
+MultiHOG::MultiHOG(size_t num_models)
+ : _num_models(num_models), _model(arm_compute::cpp14::make_unique<HOG[]>(_num_models))
+{
+}
+
+size_t MultiHOG::num_models() const
+{
+ return _num_models;
+}
+
+IHOG *MultiHOG::model(size_t index)
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
+
+const IHOG *MultiHOG::model(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _num_models);
+ return (_model.get() + index);
+}
diff --git a/src/runtime/MultiImage.cpp b/src/runtime/MultiImage.cpp
new file mode 100644
index 0000000000..def1487c5e
--- /dev/null
+++ b/src/runtime/MultiImage.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/MultiImage.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+MultiImage::MultiImage()
+ : _info(), _plane()
+{
+}
+
+const MultiImageInfo *MultiImage::info() const
+{
+ return &_info;
+}
+
+void MultiImage::init(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, false);
+}
+
+void MultiImage::init_auto_padding(unsigned int width, unsigned int height, Format format)
+{
+ internal_init(width, height, format, true);
+}
+
+void MultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
+{
+ TensorInfo info(width, height, Format::U8);
+
+ if(auto_padding)
+ {
+ info.auto_padding();
+ }
+
+ switch(format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ {
+ TensorInfo info_full(width, height, format);
+
+ if(auto_padding)
+ {
+ info_full.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info_full);
+ break;
+ }
+ case Format::NV12:
+ case Format::NV21:
+ {
+ TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+
+ if(auto_padding)
+ {
+ info_uv88.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_uv88);
+ break;
+ }
+ case Format::IYUV:
+ {
+ TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+
+ if(auto_padding)
+ {
+ info_sub2.auto_padding();
+ }
+
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info_sub2);
+ std::get<2>(_plane).allocator()->init(info_sub2);
+ break;
+ }
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->init(info);
+ std::get<1>(_plane).allocator()->init(info);
+ std::get<2>(_plane).allocator()->init(info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _info.init(width, height, format);
+}
+
+void MultiImage::allocate()
+{
+ switch(_info.format())
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F16:
+ case Format::F32:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ std::get<0>(_plane).allocator()->allocate();
+ break;
+ case Format::NV12:
+ case Format::NV21:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ break;
+ case Format::IYUV:
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->allocate();
+ std::get<1>(_plane).allocator()->allocate();
+ std::get<2>(_plane).allocator()->allocate();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+}
+
+void MultiImage::create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height)
+{
+ arm_compute::Format format = image->info()->format();
+ const TensorInfo info(width, height, Format::U8);
+
+ switch(format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::F32:
+ case Format::F16:
+ case Format::U32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ {
+ const TensorInfo info_full(width, height, format);
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info_full);
+ break;
+ }
+ case Format::NV12:
+ case Format::NV21:
+ {
+ const TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info);
+ std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_uv88);
+ break;
+ }
+ case Format::IYUV:
+ {
+ const TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info);
+ std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info_sub2);
+ std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(2))->allocator(), coords, info_sub2);
+ break;
+ }
+ case Format::YUV444:
+ std::get<0>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(0))->allocator(), coords, info);
+ std::get<1>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(1))->allocator(), coords, info);
+ std::get<2>(_plane).allocator()->init(*dynamic_cast<Image *>(image->plane(2))->allocator(), coords, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
+
+ _info.init(width, height, format);
+}
+
+Image *MultiImage::plane(unsigned int index)
+{
+ return &_plane[index];
+}
+
+const Image *MultiImage::plane(unsigned int index) const
+{
+ return &_plane[index];
+}
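
A usage sketch for the plane bookkeeping above: init() sizes only the planes the format needs (here the NV12 luma plane plus a half-resolution UV88 chroma plane), and allocate() backs exactly those planes. Dimensions are illustrative:

    #include "arm_compute/runtime/MultiImage.h"

    int main()
    {
        using namespace arm_compute;
        MultiImage img;
        img.init(640, 480, Format::NV12); // plane 0: 640x480 U8, plane 1: 320x240 UV88
        img.allocate();                   // allocates only the planes this format uses
        Image *luma   = img.plane(0);
        Image *chroma = img.plane(1);
        (void)luma;
        (void)chroma;
        return 0;
    }
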
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
new file mode 100644
index 0000000000..6f0da85fc8
--- /dev/null
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+INESimpleFunction::INESimpleFunction()
+ : _kernel(), _border_handler()
+{
+}
+
+void INESimpleFunction::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(_kernel.get(), Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
new file mode 100644
index 0000000000..b39feb3a2b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEAbsoluteDifference::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEAbsoluteDifferenceKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
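
NEAbsoluteDifference follows the INESimpleFunction pattern above: configure() builds and stores the kernel, and the inherited run() executes the border handler and then schedules the kernel along Window::DimY. A usage sketch with illustrative shapes:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
    #include "arm_compute/runtime/Tensor.h"

    int main()
    {
        using namespace arm_compute;
        Tensor a, b, out;
        for(Tensor *t : { &a, &b, &out })
        {
            t->allocator()->init(TensorInfo(320U, 240U, Format::U8));
        }

        NEAbsoluteDifference absdiff;
        absdiff.configure(&a, &b, &out); // out(x, y) = |a(x, y) - b(x, y)|

        for(Tensor *t : { &a, &b, &out })
        {
            t->allocator()->allocate(); // allocate after configure()
        }
        absdiff.run();
        return 0;
    }
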
diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp
new file mode 100644
index 0000000000..c39abfc540
--- /dev/null
+++ b/src/runtime/NEON/functions/NEAccumulate.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEAccumulate::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEAccumulateWeighted::configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16)
+{
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedFP16Kernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateWeightedKernel>();
+ k->configure(input, alpha, output);
+ _kernel = std::move(k);
+ }
+}
+
+void NEAccumulateSquared::configure(const ITensor *input, uint32_t shift, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEAccumulateSquaredKernel>();
+ k->configure(input, shift, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
new file mode 100644
index 0000000000..f5d81d7cd8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+
+using namespace arm_compute;
+
+void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+{
+ auto k = arm_compute::cpp14::make_unique<NEActivationLayerKernel>();
+ k->configure(input, output, activation_info);
+ _kernel = std::move(k);
+}
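
A usage sketch for the activation wrapper; the shape is illustrative and the ActivationLayerInfo constructor is assumed from the library headers rather than this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    int main()
    {
        using namespace arm_compute;
        Tensor src, dst;
        const TensorShape shape(224U, 224U, 64U);
        src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        dst.allocator()->init(TensorInfo(shape, 1, DataType::F32));

        NEActivationLayer relu;
        relu.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        src.allocator()->allocate(); // allocate after configure() so padding requirements are known
        dst.allocator()->allocate();
        relu.run();
        return 0;
    }
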
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
new file mode 100644
index 0000000000..50cc38b489
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<NEArithmeticAdditionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
new file mode 100644
index 0000000000..a3d27c0ed6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::cpp14::make_unique<NEArithmeticSubtractionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
new file mode 100644
index 0000000000..a24429c6de
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEBatchNormalizationLayer::NEBatchNormalizationLayer()
+ : _norm_kernel()
+{
+}
+
+void NEBatchNormalizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+{
+ // Configure kernel
+ _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+}
+
+void NEBatchNormalizationLayer::run()
+{
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
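
The function wraps a single kernel: configure() forwards the per-channel statistics and run() schedules along DimY. A sketch with illustrative shapes, assuming one mean/var/beta/gamma value per channel:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    int main()
    {
        using namespace arm_compute;
        Tensor src, dst, mean, var, beta, gamma;
        const TensorShape shape(28U, 28U, 16U); // W x H x C (illustrative)
        const TensorShape per_channel(16U);     // one value per channel (assumption)
        src.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        dst.allocator()->init(TensorInfo(shape, 1, DataType::F32));
        for(Tensor *t : { &mean, &var, &beta, &gamma })
        {
            t->allocator()->init(TensorInfo(per_channel, 1, DataType::F32));
        }

        NEBatchNormalizationLayer bn;
        bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 1e-5f);

        for(Tensor *t : { &src, &dst, &mean, &var, &beta, &gamma })
        {
            t->allocator()->allocate();
        }
        bn.run();
        return 0;
    }
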
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
new file mode 100644
index 0000000000..5aafc51dc0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseAndKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
new file mode 100644
index 0000000000..af3df6e46a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseNot::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseNotKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
new file mode 100644
index 0000000000..d12c5e5f6f
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseOrKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
new file mode 100644
index 0000000000..65c943e64c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEBitwiseXorKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp
new file mode 100644
index 0000000000..7f0b45d34c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEBox3x3.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEBox3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+{
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEBox3x3FP16Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEBox3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ }
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
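
Unlike the element-wise wrappers, NEBox3x3 reads a 3x3 neighbourhood, so configure() also sets up the border handler that INESimpleFunction::run() executes before the kernel. A sketch with illustrative parameters, matching the five-argument signature shown above:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
    #include "arm_compute/runtime/Tensor.h"

    int main()
    {
        using namespace arm_compute;
        Tensor src, dst;
        src.allocator()->init(TensorInfo(640U, 480U, Format::U8));
        dst.allocator()->init(TensorInfo(640U, 480U, Format::U8));

        NEBox3x3 box;
        // CONSTANT border: out-of-image reads see the value 0, filled in by the border handler
        box.configure(&src, &dst, BorderMode::CONSTANT, 0, false /* use_fp16 */);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        box.run(); // fills the border, then runs the 3x3 mean filter
        return 0;
    }
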
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
new file mode 100644
index 0000000000..26f31f557b
--- /dev/null
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
+#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
+#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cstring>
+#include <utility>
+
+using namespace arm_compute;
+
+NECannyEdge::NECannyEdge()
+ : _sobel(), _gradient(), _non_max_suppr(), _edge_trace(), _border_mag_gradient(), _border_edge_trace(), _gx(), _gy(), _magnitude(), _phase(), _nonmax(), _output(nullptr)
+{
+}
+
+void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
+ bool use_fp16)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(gradient_size < 3);
+ ARM_COMPUTE_ERROR_ON(gradient_size > 7);
+ ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr);
+ ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type));
+
+ _output = output;
+
+ const TensorShape &shape = input->info()->tensor_shape();
+ TensorInfo gradient_info;
+ TensorInfo magnitude_info;
+
+ // Initialize the intermediate tensor infos based on the gradient size
+ if(gradient_size < 7)
+ {
+ gradient_info.init(shape, Format::S16);
+ magnitude_info.init(shape, Format::U16);
+ }
+ else
+ {
+ gradient_info.init(shape, Format::S32);
+ magnitude_info.init(shape, Format::U32);
+ }
+
+ _gx.allocator()->init(gradient_info);
+ _gy.allocator()->init(gradient_info);
+ _magnitude.allocator()->init(magnitude_info);
+
+ TensorInfo info(shape, Format::U8);
+ _phase.allocator()->init(info);
+ _nonmax.allocator()->init(info);
+
+ // Configure/Init sobelNxN
+ if(gradient_size == 3)
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 5)
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else if(gradient_size == 7)
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Gradient size not supported\n");
+ }
+
+ // Configure gradient
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEGradientFP16Kernel>();
+ k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+ _gradient = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEGradientKernel>();
+ k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+ _gradient = std::move(k);
+ }
+
+ // Configure non-maxima suppression
+ _non_max_suppr.configure(&_magnitude, &_phase, &_nonmax, upper_thr, lower_thr, border_mode == BorderMode::UNDEFINED);
+
+ // Fill border around magnitude image as non-maxima suppression will access
+ // it. If border mode is undefined filling the border is a nop.
+ _border_mag_gradient.configure(&_magnitude, _non_max_suppr.border_size(), border_mode, constant_border_value);
+
+ // Configure edge tracing
+ _edge_trace.configure(&_nonmax, output);
+
+ // Fill border with "No edge" to stop recursion in edge trace
+ _border_edge_trace.configure(&_nonmax, _edge_trace.border_size(), BorderMode::CONSTANT, 0);
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _magnitude.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void NECannyEdge::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+ ARM_COMPUTE_ERROR_ON(_output == nullptr);
+
+ // Run sobelNxN
+ _sobel->run();
+
+ // Fill border before non-maxima suppression. Nop for border mode undefined.
+ _border_mag_gradient.run(_border_mag_gradient.window());
+
+ // Run gradient
+ NEScheduler::get().schedule(_gradient.get(), Window::DimY);
+
+ // Run non-maxima suppression
+ NEScheduler::get().schedule(&_non_max_suppr, Window::DimY);
+
+ ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+ memset(_output->buffer(), 0, _output->info()->total_size());
+
+ // Fill border before edge trace
+ _border_edge_trace.run(_border_edge_trace.window());
+
+ // Run edge tracing
+ _edge_trace.run(_edge_trace.window());
+}
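
A sketch tying the stages together: Sobel, gradient magnitude/phase, non-maxima suppression with the hysteresis thresholds, and edge tracing. Threshold values are illustrative; gradient_size 3 selects NESobel3x3 and norm_type 1 requests the L1 norm, per the checks in configure() above:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
    #include "arm_compute/runtime/Tensor.h"

    int main()
    {
        using namespace arm_compute;
        Tensor src, edges;
        src.allocator()->init(TensorInfo(640U, 480U, Format::U8));
        edges.allocator()->init(TensorInfo(640U, 480U, Format::U8));

        NECannyEdge canny;
        canny.configure(&src, &edges, 100 /* upper_thr */, 50 /* lower_thr */,
                        3 /* gradient_size */, 1 /* norm_type */, BorderMode::REPLICATE,
                        0 /* constant_border_value */, false /* use_fp16 */);

        src.allocator()->allocate();
        edges.allocator()->allocate();
        canny.run(); // Sobel -> gradient -> non-maxima suppression -> edge trace
        return 0;
    }
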
diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp
new file mode 100644
index 0000000000..84d4fff4ff
--- /dev/null
+++ b/src/runtime/NEON/functions/NEChannelCombine.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEChannelCombine::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, plane3, output);
+ _kernel = std::move(k);
+}
+
+void NEChannelCombine::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelCombineKernel>();
+ k->configure(plane0, plane1, plane2, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp
new file mode 100644
index 0000000000..634e918eac
--- /dev/null
+++ b/src/runtime/NEON/functions/NEChannelExtract.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEChannelExtract::configure(const ITensor *input, Channel channel, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
+
+void NEChannelExtract::configure(const IMultiImage *input, Channel channel, IImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEChannelExtractKernel>();
+ k->configure(input, channel, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp
new file mode 100644
index 0000000000..bbaa832284
--- /dev/null
+++ b/src/runtime/NEON/functions/NEColorConvert.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEColorConvert::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEColorConvert::configure(const IMultiImage *input, IImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEColorConvert::configure(const IImage *input, IMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
+
+void NEColorConvert::configure(const IMultiImage *input, IMultiImage *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEColorConvertKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
new file mode 100644
index 0000000000..3f39ae2cbd
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <array>
+#include <utility>
+
+using namespace arm_compute;
+
+void NEConvolution3x3::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEConvolution3x3Kernel>();
+ k->configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+template <unsigned int matrix_size>
+NEConvolutionSquare<matrix_size>::NEConvolutionSquare()
+ : _tmp(), _is_separable(false), _kernel_hor(), _kernel_vert(), _kernel(), _border_handler()
+{
+}
+
+template <unsigned int matrix_size>
+void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(conv == nullptr);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+
+ std::array<int16_t, matrix_size> conv_col{ { 0 } };
+ std::array<int16_t, matrix_size> conv_row{ { 0 } };
+
+ _is_separable = separate_matrix(conv, conv_col.data(), conv_row.data(), matrix_size);
+
+ if(_is_separable)
+ {
+ DataType intermediate_type = DataType::UNKNOWN;
+ std::tie(std::ignore, intermediate_type) = data_type_for_convolution(conv_col.data(), conv_row.data(), matrix_size);
+
+ _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, intermediate_type));
+
+ if(scale == 0)
+ {
+ scale = calculate_matrix_scale(conv, matrix_size);
+ }
+
+ _kernel_hor.configure(input, &_tmp, conv_row.data(), border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, conv_col.data(), scale, border_mode == BorderMode::UNDEFINED);
+
+ _tmp.allocator()->allocate();
+
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+ }
+ else
+ {
+ _kernel.configure(input, output, conv, scale, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, _kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ }
+}
+
+template <unsigned int matrix_size>
+void NEConvolutionSquare<matrix_size>::run()
+{
+ _border_handler.run(_border_handler.window());
+
+ if(_is_separable)
+ {
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+ }
+ else
+ {
+ NEScheduler::get().schedule(&_kernel, Window::DimY);
+ }
+}
+
+template class arm_compute::NEConvolutionSquare<5>;
+template class arm_compute::NEConvolutionSquare<7>;
+template class arm_compute::NEConvolutionSquare<9>;
+
+void NEConvolutionRectangle::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEConvolutionRectangleKernel>();
+ k->configure(input, output, conv, rows, cols, scale, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
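
separate_matrix() tests whether the coefficient matrix is the outer product of a column and a row vector; when it is, the 2D pass is replaced by a horizontal and a vertical 1D pass through a temporary tensor, and scale == 0 asks calculate_matrix_scale() to derive the scale from the matrix itself. A sketch with a separable 5x5 Gaussian (sizes and border mode are illustrative assumptions; NEConvolution5x5 is the alias for NEConvolutionSquare<5> from NEConvolution.h):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

#include <array>
#include <cstdint>

using namespace arm_compute;

int main()
{
    // Outer product of [1 4 6 4 1] with itself -> separable, so the function
    // runs it as a horizontal pass followed by a vertical pass.
    const std::array<int16_t, 25> gauss5x5 = { {
        1, 4, 6, 4, 1,
        4, 16, 24, 16, 4,
        6, 24, 36, 24, 6,
        4, 16, 24, 16, 4,
        1, 4, 6, 4, 1
    } };

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

    NEConvolution5x5 conv;
    // scale == 0 -> derived from the matrix (sum of coefficients, 256 here)
    conv.configure(&src, &dst, gauss5x5.data(), 0, BorderMode::CONSTANT, 0);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, then:
    conv.run();
    return 0;
}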
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
new file mode 100644
index 0000000000..bd688cffb6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+{
+}
+
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+    // Check if biases are present; if so, they will be embedded into the weights matrix
+ const bool _has_bias = (biases != nullptr);
+
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Create tensor to store the reshaped weights
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases, output);
+ }
+}
+
+void NEConvolutionLayerReshapeWeights::run()
+{
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ if(_transpose1xW)
+ {
+ NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
+ }
+}
+
+NEConvolutionLayer::NEConvolutionLayer()
+ : _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(),
+ _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _has_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size() : weights->info()->dimension(0);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width,
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+    // Check if it is a "fully connected" convolution
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+
+ // Reshape weights if needed
+ if(_are_weights_reshaped)
+ {
+ mat_weights_cols = output->info()->dimension(2);
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wr);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _weights_reshaped.allocator()->init(info_wt);
+ _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+ }
+ weights = &_weights_reshaped;
+ }
+
+ // Create tensor to store im2col reshaped inputs
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+    // Create the interleaved tensor used to prepare the input tensor for GEMM
+ if(!_is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ }
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, _has_bias);
+ if(_is_fully_connected_convolution)
+ {
+ _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ else
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
+ }
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+    // Allocate intermediate tensors
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
+ _input_im2col_reshaped.allocator()->allocate();
+ if(!_is_fully_connected_convolution)
+ {
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
+ _gemm_output.allocator()->allocate();
+}
+
+void NEConvolutionLayer::run()
+{
+    // Run weights reshaping (runs only once per configure() call)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights.run();
+ }
+
+ // Run input reshaping
+ NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+ if(!_is_fully_connected_convolution)
+ {
+ // Run interleave
+ NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+ }
+
+    // Run matrix multiply on the reshaped matrices
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+
+ // Reshape output matrix
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+}
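
To make the GEMM shapes concrete: with an F32 16x16x8 input, sixteen 3x3x8 kernels, stride 1 and no padding (all assumed for illustration), conv_w = conv_h = 14, mat_weights_rows = 3*3*8 + 1 (bias) = 73 and mat_weights_cols = 16, so im2col produces a [73 x 196] matrix and the GEMM a [16 x 196] one that col2im folds back into 14x14x16. A corresponding usage sketch:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(14U, 14U, 16U), 1, DataType::F32)); // conv_w = conv_h = (16 - 3) / 1 + 1

    NEConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0));

    // allocate + fill the tensors, then conv.run();
    return 0;
}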
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
new file mode 100644
index 0000000000..7d2c5494a9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDepthConcatenate::NEDepthConcatenate()
+ : _inputs_vector(), _concat_kernels_vector(), _border_handlers_vector(), _num_inputs(0)
+{
+}
+
+void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
+
+ _num_inputs = inputs_vector.size();
+ _concat_kernels_vector = arm_compute::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+ _border_handlers_vector = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
+
+ unsigned int depth_offset = 0;
+ for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output);
+ _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+ depth_offset += inputs_vector.at(i)->info()->dimension(2);
+ }
+}
+
+void NEDepthConcatenate::run()
+{
+    for(unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ NEScheduler::get().schedule(&_border_handlers_vector[i], Window::DimX);
+ NEScheduler::get().schedule(&_concat_kernels_vector[i], Window::DimX);
+ }
+}
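
Each input is written at a running depth_offset, and its borders are zero-filled first so the valid regions line up in the output. A sketch concatenating two F32 tensors along the Z axis (shapes are illustrative assumptions):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(32U, 32U, 48U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 64U), 1, DataType::F32));

    NEDepthConcatenate concat;
    concat.configure({ &in0, &in1 }, &out); // in0 -> depth [0, 16), in1 -> depth [16, 64)

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate();
    // ... fill in0 and in1, then:
    concat.run();
    return 0;
}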
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
new file mode 100644
index 0000000000..a339cae316
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
+
+ auto k = arm_compute::cpp14::make_unique<NEDepthConvertKernel>();
+ k->configure(input, output, policy, shift);
+ _kernel = std::move(k);
+}
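
A sketch of an up-conversion, assuming U8 to S16 is among the supported type pairs; shift is the number of bits the values are shifted left during the conversion:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::S16));

    NEDepthConvert convert;
    convert.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, then:
    convert.run();
    return 0;
}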
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
new file mode 100644
index 0000000000..2887c13233
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEDerivative::NEDerivative()
+ : _kernel(), _border_handler()
+{
+}
+
+void NEDerivative::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+ _kernel.configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _border_handler.configure(input, 1, border_mode, PixelValue(constant_border_value));
+}
+
+void NEDerivative::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
new file mode 100644
index 0000000000..0c016f14f9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDilate.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEDilate::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEDilateKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
new file mode 100644
index 0000000000..3f3e7710fb
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEDirectConvolutionLayer::NEDirectConvolutionLayer()
+ : _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+{
+}
+
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+
+ // Free accumulator
+ if(_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ // Allocate the intermediate accumulator tensor in case of fixed point input
+ if(output->info()->data_type() == DataType::QS8)
+ {
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ _accumulate_bias_kernel.configure(&_accumulator, bias, output);
+ _accumulator.allocator()->allocate();
+ }
+ else
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ _accumulate_bias_kernel.configure(output, bias);
+ }
+
+ // Add zero padding XY
+ _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+void NEDirectConvolutionLayer::run()
+{
+ _input_border_handler.run(_input_border_handler.window());
+
+ NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+ NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+}
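
Unlike NEConvolutionLayer there is no im2col/GEMM lowering: the kernel convolves directly, and for QS8 an intermediate QS16 accumulator tensor is inserted so that bias accumulation happens at higher precision. A sketch for the F32 path, assuming a 3x3 kernel with stride 1 is a configuration the direct-convolution kernel supports:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(30U, 30U, 16U), 1, DataType::F32)); // (32 - 3) / 1 + 1 = 30

    NEDirectConvolutionLayer conv;
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0));

    // allocate + fill the tensors, then conv.run();
    return 0;
}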
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
new file mode 100644
index 0000000000..f6ec677e44
--- /dev/null
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEEqualizeHistogram::NEEqualizeHistogram()
+ : _histogram_kernel(), _cd_histogram_kernel(), _map_histogram_kernel(), _hist(nr_bins, 0, max_range), _cum_dist(nr_bins, 0, max_range), _cd_lut(nr_bins, DataType::U8)
+{
+}
+
+void NEEqualizeHistogram::configure(const IImage *input, IImage *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ // Configure kernels
+ _histogram_kernel.configure(input, &_hist);
+ _cd_histogram_kernel.configure(input, &_hist, &_cum_dist, &_cd_lut);
+ _map_histogram_kernel.configure(input, &_cd_lut, output);
+}
+
+void NEEqualizeHistogram::run()
+{
+ // Calculate histogram of input.
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+
+ // Calculate cumulative distribution of histogram and create LUT.
+ _cd_histogram_kernel.run(_cd_histogram_kernel.window());
+
+ // Map input to output using created LUT.
+ NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
+}
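
run() implements the classic equalization pipeline: build the histogram, turn its cumulative distribution into a U8 LUT, then remap the image through that LUT. As a reference for what the LUT stage computes, the usual mapping is lut[i] = round((cdf[i] - cdf_min) * 255 / (num_pixels - cdf_min)); the exact rounding inside the kernel may differ. A usage sketch:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst; // both 2D single-channel U8, per the checks in configure()
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));

    NEEqualizeHistogram eq;
    eq.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, then:
    eq.run();
    return 0;
}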
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
new file mode 100644
index 0000000000..9b011db845
--- /dev/null
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEErode.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEErode::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEErodeKernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
new file mode 100644
index 0000000000..33a58f1904
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEFastCorners::NEFastCorners()
+ : _fast_corners_kernel(),
+ _border_handler(),
+ _nonmax_kernel(),
+ _fill_kernel(),
+ _output(),
+ _suppressed(),
+ _non_max(false)
+{
+}
+
+void NEFastCorners::configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(BorderMode::UNDEFINED != border_mode);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == corners);
+    ARM_COMPUTE_ERROR_ON(threshold < 1 || threshold > 255);
+
+ _non_max = nonmax_suppression;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::U8);
+ _output.allocator()->init(tensor_info);
+
+    // If the border is UNDEFINED, _fast_corners_kernel operates on an x-window of
+    // (3, width - 3) and a y-window of (3, height - 3), so the border pixels of
+    // the output image are left unchanged. This is reflected in the valid region
+    // of the output. Non-maxima suppression is only run on the valid pixels.
+ _fast_corners_kernel.configure(input, &_output, threshold, nonmax_suppression, BorderMode::UNDEFINED == border_mode);
+    _border_handler.configure(input, _fast_corners_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+
+ if(!_non_max)
+ {
+ _fill_kernel.configure(&_output, 1 /* we keep all texels >0 */, corners);
+ }
+ else
+ {
+ _suppressed.allocator()->init(tensor_info);
+ _nonmax_kernel.configure(&_output, &_suppressed, BorderMode::UNDEFINED == border_mode);
+ _fill_kernel.configure(&_suppressed, 1 /* we keep all texels >0 */, corners);
+
+ // Allocate intermediate tensors
+ _suppressed.allocator()->allocate();
+ }
+
+ // Allocate intermediate tensors
+ _output.allocator()->allocate();
+}
+
+void NEFastCorners::run()
+{
+ _border_handler.run(_border_handler.window());
+
+ NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
+
+ if(_non_max)
+ {
+ NEScheduler::get().schedule(&_nonmax_kernel, Window::DimY);
+ }
+
+ NEScheduler::get().schedule(&_fill_kernel, Window::DimY);
+}
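
A usage sketch; the capacity passed to the KeyPointArray is an assumption (the fill kernel stops adding corners once the array is full), and threshold must lie in [1, 255]:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Array.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));

    KeyPointArray corners(10000); // maximum number of corners to return

    NEFastCorners fast;
    fast.configure(&src, 20.0f /* threshold */, true /* nonmax_suppression */, &corners, BorderMode::UNDEFINED);

    src.allocator()->allocate();
    // ... fill src, then:
    fast.run();
    // corners.num_values() keypoints are now valid
    return 0;
}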
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
new file mode 100644
index 0000000000..e884f4a668
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFillBorder.h"
+
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+ _border_handler.configure(input, border_width, border_mode, constant_border_value);
+}
+
+void NEFillBorder::run()
+{
+ NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+}
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
new file mode 100644
index 0000000000..abb41e9f70
--- /dev/null
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace arm_compute;
+
+NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights()
+ : _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
+{
+}
+
+void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != 2);
+ ARM_COMPUTE_ERROR_ON((transpose_weights == false) && (is_batched_fc_layer == false));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _transpose_weights = transpose_weights;
+ _is_batched_fc_layer = is_batched_fc_layer;
+
+ // Check if we need to transpose the weights
+ if(_transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Initialize the output tensor for transpose
+ TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
+ _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, dt, fixed_point_position));
+ _transpose_kernel.configure(input, &_transpose_output);
+
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(&_transpose_output, output);
+
+ // Allocate temporary tensor used for transposing the weights
+ _transpose_output.allocator()->allocate();
+ }
+ else
+ {
+ _transpose_kernel.configure(input, output);
+ }
+ }
+ else
+ {
+ if(_is_batched_fc_layer)
+ {
+ // Configure transpose 1xW kernel
+ _transpose1xW_kernel.configure(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+ }
+ }
+}
+
+void NEFullyConnectedLayerReshapeWeights::run()
+{
+ if(_transpose_weights)
+ {
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+ if(_is_batched_fc_layer)
+ {
+ NEScheduler::get().schedule(&_transpose1xW_kernel, Window::DimY);
+ }
+}
+
+NEFullyConnectedLayer::NEFullyConnectedLayer()
+ : _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(), _reshape_weights_output(),
+ _are_weights_reshaped(false), _is_fc_after_conv(false), _is_batched_fc_layer(false), _accumulate_biases(false)
+{
+}
+
+void NEFullyConnectedLayer::configure_conv_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2) * (16 / weights->info()->element_size())));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, input->info()->dimension(3));
+ shape_im2col.set(2, input->info()->dimension(4));
+ shape_im2col.set(3, input->info()->dimension(5));
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = _im2col_output.info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(&_im2col_output, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_wb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // Initialize output tensor for interleave 4x4
+ TensorShape shape_interleaved = input->info()->tensor_shape();
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(static_cast<float>(shape_interleaved.y()) / 4));
+ _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+
+ // Configure interleave4x4 kernel
+ _interleave4x4_kernel.configure(input, &_interleave4x4_output);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_interleave4x4_output, weights, output, 1.0f);
+
+ // Allocate the tensors once all the configure methods have been called
+ _interleave4x4_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_conv_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for im2col
+ TensorShape shape_im2col;
+ shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+ shape_im2col.set(1, 1);
+ _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+
+ // Configure im2col kernel
+ _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(&_im2col_output, weights, output, 1.0f);
+
+ // Allocate the output tensor for im2col once all the configure methods have been called
+ _im2col_output.allocator()->allocate();
+}
+
+void NEFullyConnectedLayer::configure_fc_fc_nb(const ITensor *input, const ITensor *weights, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(input, weights, output, 1.0f);
+}
+
+void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose_weights, bool are_weights_reshaped)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
+
+ const DataType dt = input->info()->data_type();
+ const int fixed_point_position = input->info()->fixed_point_position();
+
+ _are_weights_reshaped = are_weights_reshaped;
+ _is_fc_after_conv = true;
+ _is_batched_fc_layer = false;
+ _accumulate_biases = false;
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+ _accumulate_biases = true;
+
+ // Configure accumulate biases kernel
+ _accumulate_biases_kernel.configure(output, biases);
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ // Check if we have a fully connected layer with batches
+ _is_batched_fc_layer = (output->info()->dimension(1) > 1);
+
+ const ITensor *weights_to_use = weights;
+
+ if(!are_weights_reshaped)
+ {
+ if((transpose_weights || _is_batched_fc_layer))
+ {
+ weights_to_use = &_reshape_weights_output;
+
+ if(transpose_weights)
+ {
+ if(_is_batched_fc_layer)
+ {
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(0) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(1) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ else
+ {
+ TensorShape shape_wt(weights->info()->dimension(1), weights->info()->dimension(0));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(!_is_batched_fc_layer);
+
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(weights->info()->dimension(1) * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(weights->info()->dimension(0) / transpose_width)));
+ TensorInfo info_wt(shape_wt, 1, dt, fixed_point_position);
+ _reshape_weights_output.allocator()->init(info_wt);
+ }
+
+ // Reshape the weights
+ _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ }
+ }
+
+ if(_is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+ input->info()->tensor_shape().cend(),
+ output->info()->tensor_shape().cbegin() + 1));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer with batches
+ configure_conv_fc_wb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer with batches
+ configure_fc_fc_wb(input, weights_to_use, output);
+ }
+ }
+ else
+ {
+        // In the case of a non-batched fully connected layer, the weights are not reshaped using transpose1xW
+ _is_fc_after_conv = ((weights_to_use->info()->dimension(1)) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)));
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc_nb(input, weights_to_use, output);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc_nb(input, weights_to_use, output);
+ }
+ }
+
+    // Allocate the reshaped weights tensor only if are_weights_reshaped is false, once all the configure methods have been called
+ if(!are_weights_reshaped)
+ {
+ if(transpose_weights || _is_batched_fc_layer)
+ {
+ // Allocate the tensor for the weights reshaped
+ _reshape_weights_output.allocator()->allocate();
+ }
+ }
+}
+
+void NEFullyConnectedLayer::run()
+{
+    // Reshape the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights_kernel.run();
+ }
+
+    // Linearize the input if it comes from a convolutional layer
+ if(_is_fc_after_conv)
+ {
+ NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
+ }
+
+ // Interleave input
+ if(_is_batched_fc_layer)
+ {
+ NEScheduler::get().schedule(&_interleave4x4_kernel, Window::DimY);
+ }
+
+ // Run matrix multiply
+ NEScheduler::get().schedule(&_mm_kernel, _is_batched_fc_layer ? Window::DimY : Window::DimX);
+
+ // Accumulate biases if provided
+ if(_accumulate_biases)
+ {
+ NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY);
+ }
+}
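
Of the four cases listed in configure(), case 1 (convolution output feeding a non-batched fully connected layer) is sketched below. All shapes are illustrative assumptions, with the default arguments transpose_weights = true and are_weights_reshaped = false:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, biases, dst;
    // 7x7x64 convolution output feeding 128 output neurons, no batches
    src.allocator()->init(TensorInfo(TensorShape(7U, 7U, 64U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3136U, 128U), 1, DataType::F32)); // one row of 7*7*64 weights per output
    biases.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));

    NEFullyConnectedLayer fc;
    fc.configure(&src, &weights, &biases, &dst); // defaults: transpose_weights = true, are_weights_reshaped = false

    // allocate + fill the tensors, then fc.run();
    return 0;
}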
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
new file mode 100644
index 0000000000..15d5f4effb
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+NEGEMM::NEGEMM()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _run_vector_matrix_multiplication(false), _run_addition(false)
+{
+}
+
+void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(d, 1, DataType::F32, DataType::F16, DataType::QS8);
+
+ if(c != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
+ ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+ }
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+ // Check if the first input tensor is a vector. If so, all the kernels for reshaping the tensors can be skipped
+    if(a->info()->dimension(1) == 1)
+ {
+ _run_vector_matrix_multiplication = true;
+
+ // Configure the matrix multiply kernel
+ _mm_kernel.configure(a, b, d, alpha);
+ }
+ else
+ {
+ _run_vector_matrix_multiplication = false;
+
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+ switch(a->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 4);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 4.0f));
+ break;
+ }
+            case DataType::F16:
+#ifdef ARM_COMPUTE_ENABLE_FP16
+            {
+                shape_tmp_b.set(0, b->info()->dimension(1) * 8);
+                shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 8.0f));
+                break;
+            }
+#else  /* ARM_COMPUTE_ENABLE_FP16 */
+            {
+                ARM_COMPUTE_ERROR("DataType::F16 not supported without ARM_COMPUTE_ENABLE_FP16");
+                break;
+            }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::QS8:
+ {
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.0f));
+ break;
+ }
+ default:
+ {
+                ARM_COMPUTE_ERROR("Data type not supported");
+ }
+ }
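+
+        // NEGEMMTranspose1xW stores B in 1xW blocks, where W = 16 bytes / element size:
+        // 4 F32, 8 F16 or 16 QS8 values per block, hence the per-type factors above.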
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
+
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+
+ // Configure interleave kernel
+ _interleave_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _transpose_kernel.configure(b, &_tmp_b);
+
+ // Configure matrix multiplication kernel
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+
+        // Allocate once all the configure methods have been called
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+
+ // Configure matrix addition kernel
+    if(beta != 0.f && c != nullptr)
+ {
+ _ma_kernel.configure(c, d, beta);
+ _run_addition = true;
+ }
+}
+
+void NEGEMM::run()
+{
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
+
+ // Run matrix multiply kernel
+ NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
+ }
+}
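+
+// Minimal usage sketch (illustrative names; tensor allocation assumed elsewhere):
+//   NEGEMM gemm;
+//   gemm.configure(&a, &b, &c, &d, 1.f, 1.f); // d = 1.f * (a * b) + 1.f * c
+//   gemm.run();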
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
new file mode 100644
index 0000000000..4c77c88656
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+
+using namespace arm_compute;
+
+void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
new file mode 100644
index 0000000000..b64f769459
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowp.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEGEMMLowp::NEGEMMLowp()
+ : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
+{
+}
+
+void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
+    /* The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ] */
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
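+    /* The transposed output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] */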
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+
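+    /* The matrix multiply kernel requantises each output entry, effectively computing
+     * (((sum of (a + a_offset) * (b + b_offset)) + output_offset) * output_mult_int) >> shift
+     * -- the usual fixed-point scheme for 8-bit GEMM. */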
+ _interleave_kernel.configure(a, &_tmp_a);
+ _transpose_kernel.configure(b, &_tmp_b);
+ _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
+
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+}
+
+void NEGEMMLowp::run()
+{
+ /* Run interleave kernel */
+ NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
+
+ /* Run transpose kernel */
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+
+ /* Run matrix multiply kernel */
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
new file mode 100644
index 0000000000..dc40ecec14
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+void NEGEMMTranspose1xW::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp
new file mode 100644
index 0000000000..95ba5cbdf9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEGaussian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEGaussian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
new file mode 100644
index 0000000000..5ccc765966
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEGaussian5x5::NEGaussian5x5()
+ : _kernel_hor(), _kernel_vert(), _tmp(), _border_handler()
+{
+}
+
+void NEGaussian5x5::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ // Init temporary buffer
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
+ _tmp.allocator()->init(tensor_info);
+
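+    // The 5x5 Gaussian is separable: a horizontal pass writes a signed 16-bit intermediate
+    // (to avoid overflow), then a vertical pass produces the final U8 output.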
+ // Create and configure kernels for the two passes
+ _kernel_hor.configure(input, &_tmp, border_mode == BorderMode::UNDEFINED);
+ _kernel_vert.configure(&_tmp, output, border_mode == BorderMode::UNDEFINED);
+
+ _tmp.allocator()->allocate();
+
+ _border_handler.configure(input, _kernel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NEGaussian5x5::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
new file mode 100644
index 0000000000..e1d64f11f6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/Pyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+NEGaussianPyramid::NEGaussianPyramid()
+ : _input(nullptr), _pyramid(nullptr), _tmp()
+{
+}
+
+NEGaussianPyramidHalf::NEGaussianPyramidHalf()
+ : _border_handler(), _horizontal_reduction(), _vertical_reduction()
+{
+}
+
+void NEGaussianPyramidHalf::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _border_handler = arm_compute::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+ _horizontal_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+ _vertical_reduction = arm_compute::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+
+ // Apply half scale to the X dimension of the tensor shape
+ TensorShape tensor_shape = pyramid->info()->tensor_shape();
+ tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
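+        // Illustration (assumed size): a 640-wide level gives (640 + 1) * 0.5 -> a 320 pixel wide buffer.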
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::S16);
+ _tmp.init(pyramid_info);
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ /* Configure horizontal kernel */
+ _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure vertical kernel */
+ _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+
+ /* Configure border */
+ _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+ }
+
+ _tmp.allocate();
+ }
+}
+
+void NEGaussianPyramidHalf::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ _border_handler[i].run(_border_handler[i].window());
+ NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+ NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
+ }
+}
+
+NEGaussianPyramidOrb::NEGaussianPyramidOrb()
+ : _offsets(), _gaus5x5(), _scale_nearest()
+{
+}
+
+void NEGaussianPyramidOrb::configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_ORB != pyramid->info()->scale());
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ _input = input;
+ _pyramid = pyramid;
+
+ if(num_levels > 1)
+ {
+ _gaus5x5 = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(num_levels - 1);
+ _scale_nearest = arm_compute::cpp14::make_unique<NEScaleKernel[]>(num_levels - 1);
+ _offsets = arm_compute::cpp14::make_unique<Image[]>(num_levels - 1);
+
+ PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_ORB, pyramid->info()->tensor_shape(), Format::U8);
+ _tmp.init(pyramid_info);
+
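+        // Each level is smoothed with a 5x5 Gaussian and rescaled by SCALE_PYRAMID_ORB
+        // (2^-0.25, ~0.841) using nearest-neighbour interpolation with precomputed offsets.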
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ const size_t width = _pyramid->get_pyramid_level(i + 1)->info()->dimension(0);
+ const size_t height = _pyramid->get_pyramid_level(i + 1)->info()->dimension(1);
+
+ /* Allocate Image for the offsets used by NEAREST interpolation */
+ TensorInfo tensor_info(TensorShape(width, height), Format::S32);
+ _offsets[i].allocator()->init(tensor_info);
+
+ /* Configure gaussian 5x5 */
+ _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
+
+ /* Configure scale image kernel */
+ _scale_nearest[i].configure(_tmp.get_pyramid_level(i), nullptr, nullptr, _offsets.get() + i, _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR,
+ border_mode == BorderMode::UNDEFINED);
+
+ _offsets[i].allocator()->allocate();
+ }
+
+ _tmp.allocate();
+ }
+}
+
+void NEGaussianPyramidOrb::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_pyramid == nullptr, "Unconfigured function");
+
+ /* Get number of pyramid levels */
+ const size_t num_levels = _pyramid->info()->num_levels();
+
+ /* The first level of the pyramid has the input image */
+ _pyramid->get_pyramid_level(0)->copy_from(*_input);
+
+ for(unsigned int i = 0; i < num_levels - 1; ++i)
+ {
+ _gaus5x5[i].run();
+ NEScheduler::get().schedule(_scale_nearest.get() + i, Window::DimY);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
new file mode 100644
index 0000000000..a592f53d44
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/HOGInfo.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEHOGDescriptor::NEHOGDescriptor()
+ : _gradient(), _orient_bin(), _block_norm(), _mag(), _phase(), _hog_space()
+{
+}
+
+void NEHOGDescriptor::configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+ ARM_COMPUTE_ERROR_ON(nullptr == hog);
+
+ const HOGInfo *hog_info = hog->info();
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const size_t num_bins = hog_info->num_bins();
+
+ Size2D cell_size = hog_info->cell_size();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell_size.width;
+ const size_t num_cells_y = height / cell_size.height;
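+    // Illustration (assumed values): a 128x128 input with 8x8 cells yields a 16x16 hog_space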
+
+ // TensorShape of the input image
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // TensorShape of the hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate memory for magnitude, phase and hog space
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space.allocator()->init(info_space);
+
+ // Initialise gradient kernel
+ _gradient.configure(input, &_mag, &_phase, hog_info->phase_type(), border_mode, constant_border_value);
+
+ // Initialise orientation binning kernel
+ _orient_bin.configure(&_mag, &_phase, &_hog_space, hog->info());
+
+ // Initialize HOG norm kernel
+ _block_norm.configure(&_hog_space, output, hog->info());
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+ _hog_space.allocator()->allocate();
+}
+
+void NEHOGDescriptor::run()
+{
+ // Run gradient
+ _gradient.run();
+
+ // Run orientation binning kernel
+ NEScheduler::get().schedule(&_orient_bin, Window::DimY);
+
+ // Run block normalization kernel
+ NEScheduler::get().schedule(&_block_norm, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
new file mode 100644
index 0000000000..e8ed29d0b9
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
+
+using namespace arm_compute;
+
+void NEHOGDetector::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, size_t idx_class)
+{
+ auto k = arm_compute::cpp14::make_unique<NEHOGDetectorKernel>();
+ k->configure(input, hog, detection_windows, detection_window_stride, threshold, idx_class);
+ _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
new file mode 100644
index 0000000000..2f4b8802e3
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEHOGGradient::NEHOGGradient()
+ : _derivative(), _mag_phase(nullptr), _gx(), _gy()
+{
+}
+
+void NEHOGGradient::configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_magnitude, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_phase, 1, DataType::U8);
+
+ const TensorShape &shape_img = input->info()->tensor_shape();
+
+ // Allocate image memory
+ TensorInfo info(shape_img, Format::S16);
+ _gx.allocator()->init(info);
+ _gy.allocator()->init(info);
+
+    // Initialise derivative kernel
+ _derivative.configure(input, &_gx, &_gy, border_mode, constant_border_value);
+
+ // Initialise magnitude/phase kernel
+ if(PhaseType::UNSIGNED == phase_type)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+ k->configure(&_gx, &_gy, output_magnitude, output_phase);
+ _mag_phase = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(&_gx, &_gy, output_magnitude, output_phase);
+ _mag_phase = std::move(k);
+ }
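+
+    // UNSIGNED phase maps gradient orientations to [0, 180) degrees; SIGNED maps them to
+    // [0, 360). Both variants compute the magnitude with the L2 norm, per the template
+    // arguments above.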
+
+ // Allocate intermediate tensors
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+}
+
+void NEHOGGradient::run()
+{
+ // Run derivative
+ _derivative.run();
+
+ // Run magnitude/phase kernel
+ NEScheduler::get().schedule(_mag_phase.get(), Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
new file mode 100644
index 0000000000..173b8f4c42
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+NEHOGMultiDetection::NEHOGMultiDetection()
+ : _gradient_kernel(), _orient_bin_kernel(), _block_norm_kernel(), _hog_detect_kernel(), _non_maxima_kernel(), _hog_space(), _hog_norm_space(), _detection_windows(), _mag(), _phase(),
+ _non_maxima_suppression(false), _num_orient_bin_kernel(0), _num_block_norm_kernel(0), _num_hog_detect_kernel(0)
+{
+}
+
+void NEHOGMultiDetection::configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode,
+ uint8_t constant_border_value, float threshold, bool non_maxima_suppression, float min_distance)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(multi_hog);
+ ARM_COMPUTE_ERROR_ON(nullptr == detection_windows);
+ ARM_COMPUTE_ERROR_ON(detection_window_strides->num_values() != multi_hog->num_models());
+
+ const size_t width = input->info()->dimension(Window::DimX);
+ const size_t height = input->info()->dimension(Window::DimY);
+ const TensorShape &shape_img = input->info()->tensor_shape();
+ const size_t num_models = multi_hog->num_models();
+ PhaseType phase_type = multi_hog->model(0)->info()->phase_type();
+
+ size_t prev_num_bins = multi_hog->model(0)->info()->num_bins();
+ Size2D prev_cell_size = multi_hog->model(0)->info()->cell_size();
+ Size2D prev_block_size = multi_hog->model(0)->info()->block_size();
+ Size2D prev_block_stride = multi_hog->model(0)->info()->block_stride();
+
+    /* Check if the NEHOGOrientationBinningKernel and NEHOGBlockNormalizationKernel kernels can be skipped for a specific HOG data-object
+     *
+     * 1) NEHOGOrientationBinningKernel is skipped if the cell size and the number of bins don't change.
+     *        Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1".
+     * 2) NEHOGBlockNormalizationKernel is skipped if, in addition, the block size and the block stride do not change.
+     *        Since "multi_hog" is sorted, it is enough to compare the HOG descriptors at level "i" and level "i-1".
+     *
+     * @note Since the orientation binning and block normalization kernels can be skipped, we need to keep track of the input to process for each kernel
+     *       with "input_orient_bin", "input_hog_detect" and "input_block_norm"
+     */
+ std::vector<size_t> input_orient_bin;
+ std::vector<size_t> input_hog_detect;
+ std::vector<std::pair<size_t, size_t>> input_block_norm;
+
+ input_orient_bin.push_back(0);
+ input_hog_detect.push_back(0);
+ input_block_norm.emplace_back(0, 0);
+
+ for(size_t i = 1; i < num_models; ++i)
+ {
+ size_t cur_num_bins = multi_hog->model(i)->info()->num_bins();
+ Size2D cur_cell_size = multi_hog->model(i)->info()->cell_size();
+ Size2D cur_block_size = multi_hog->model(i)->info()->block_size();
+ Size2D cur_block_stride = multi_hog->model(i)->info()->block_stride();
+
+ if((cur_num_bins != prev_num_bins) || (cur_cell_size.width != prev_cell_size.width) || (cur_cell_size.height != prev_cell_size.height))
+ {
+ prev_num_bins = cur_num_bins;
+ prev_cell_size = cur_cell_size;
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute orientation binning and block normalization kernels. Update input to process
+ input_orient_bin.push_back(i);
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+ else if((cur_block_size.width != prev_block_size.width) || (cur_block_size.height != prev_block_size.height) || (cur_block_stride.width != prev_block_stride.width)
+ || (cur_block_stride.height != prev_block_stride.height))
+ {
+ prev_block_size = cur_block_size;
+ prev_block_stride = cur_block_stride;
+
+ // Compute block normalization kernel. Update input to process
+ input_block_norm.emplace_back(i, input_orient_bin.size() - 1);
+ }
+
+ // Update input to process for hog detector kernel
+ input_hog_detect.push_back(input_block_norm.size() - 1);
+ }
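+
+    // Illustration (assumed models): with three sorted models where only model 2 changes the block
+    // size, input_orient_bin = {0}, input_block_norm = {(0,0), (2,0)} and input_hog_detect = {0, 0, 1}.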
+
+ _detection_windows = detection_windows;
+ _non_maxima_suppression = non_maxima_suppression;
+ _num_orient_bin_kernel = input_orient_bin.size(); // Number of NEHOGOrientationBinningKernel kernels to compute
+ _num_block_norm_kernel = input_block_norm.size(); // Number of NEHOGBlockNormalizationKernel kernels to compute
+ _num_hog_detect_kernel = input_hog_detect.size(); // Number of NEHOGDetector functions to compute
+
+ _orient_bin_kernel = arm_compute::cpp14::make_unique<NEHOGOrientationBinningKernel[]>(_num_orient_bin_kernel);
+ _block_norm_kernel = arm_compute::cpp14::make_unique<NEHOGBlockNormalizationKernel[]>(_num_block_norm_kernel);
+ _hog_detect_kernel = arm_compute::cpp14::make_unique<NEHOGDetector[]>(_num_hog_detect_kernel);
+ _non_maxima_kernel = arm_compute::cpp14::make_unique<CPPDetectionWindowNonMaximaSuppressionKernel>();
+ _hog_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_orient_bin_kernel);
+ _hog_norm_space = arm_compute::cpp14::make_unique<Tensor[]>(_num_block_norm_kernel);
+
+ // Allocate tensors for magnitude and phase
+ TensorInfo info_mag(shape_img, Format::S16);
+ _mag.allocator()->init(info_mag);
+
+ TensorInfo info_phase(shape_img, Format::U8);
+ _phase.allocator()->init(info_phase);
+
+ // Initialise gradient kernel
+ _gradient_kernel.configure(input, &_mag, &_phase, phase_type, border_mode, constant_border_value);
+
+    // Configure tensors for the HOG space and the orientation binning kernels
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_orient_bin[i];
+
+ // Get the corresponding cell size and number of bins
+ const Size2D &cell = multi_hog->model(idx_multi_hog)->info()->cell_size();
+ const size_t num_bins = multi_hog->model(idx_multi_hog)->info()->num_bins();
+
+ // Calculate number of cells along the x and y directions for the hog_space
+ const size_t num_cells_x = width / cell.width;
+ const size_t num_cells_y = height / cell.height;
+
+ // TensorShape of hog space
+ TensorShape shape_hog_space = input->info()->tensor_shape();
+ shape_hog_space.set(Window::DimX, num_cells_x);
+ shape_hog_space.set(Window::DimY, num_cells_y);
+
+ // Allocate HOG space
+ TensorInfo info_space(shape_hog_space, num_bins, DataType::F32);
+ _hog_space[i].allocator()->init(info_space);
+
+ // Initialise orientation binning kernel
+ _orient_bin_kernel[i].configure(&_mag, &_phase, _hog_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+    // Configure tensors for the normalized HOG space and the block normalization kernels
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ const size_t idx_multi_hog = input_block_norm[i].first;
+ const size_t idx_orient_bin = input_block_norm[i].second;
+
+ // Allocate normalized HOG space
+ TensorInfo tensor_info(*(multi_hog->model(idx_multi_hog)->info()), width, height);
+ _hog_norm_space[i].allocator()->init(tensor_info);
+
+ // Initialize block normalization kernel
+ _block_norm_kernel[i].configure(_hog_space.get() + idx_orient_bin, _hog_norm_space.get() + i, multi_hog->model(idx_multi_hog)->info());
+ }
+
+ // Configure HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ const size_t idx_block_norm = input_hog_detect[i];
+
+ _hog_detect_kernel[i].configure(_hog_norm_space.get() + idx_block_norm, multi_hog->model(i), detection_windows, detection_window_strides->at(i), threshold, i);
+ }
+
+ // Configure non maxima suppression kernel
+ _non_maxima_kernel->configure(_detection_windows, min_distance);
+
+ // Allocate intermediate tensors
+ _mag.allocator()->allocate();
+ _phase.allocator()->allocate();
+
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ _hog_space[i].allocator()->allocate();
+ }
+
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ _hog_norm_space[i].allocator()->allocate();
+ }
+}
+
+void NEHOGMultiDetection::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_detection_windows == nullptr, "Unconfigured function");
+
+ // Reset detection window
+ _detection_windows->clear();
+
+ // Run gradient
+ _gradient_kernel.run();
+
+ // Run orientation binning kernel
+ for(size_t i = 0; i < _num_orient_bin_kernel; ++i)
+ {
+ NEScheduler::get().schedule(_orient_bin_kernel.get() + i, Window::DimY);
+ }
+
+ // Run block normalization kernel
+ for(size_t i = 0; i < _num_block_norm_kernel; ++i)
+ {
+ NEScheduler::get().schedule(_block_norm_kernel.get() + i, Window::DimY);
+ }
+
+ // Run HOG detector kernel
+ for(size_t i = 0; i < _num_hog_detect_kernel; ++i)
+ {
+ _hog_detect_kernel[i].run();
+ }
+
+ // Run non-maxima suppression kernel if enabled
+ if(_non_maxima_suppression)
+ {
+ _non_maxima_kernel->run(_non_maxima_kernel->window());
+ }
+}
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
new file mode 100644
index 0000000000..b54fb67ab7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Array.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
+#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
+#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cmath>
+#include <utility>
+
+using namespace arm_compute;
+
+NEHarrisCorners::NEHarrisCorners()
+ : _sobel(), _harris_score(), _non_max_suppr(), _candidates(), _sort_euclidean(), _border_gx(), _border_gy(), _gx(), _gy(), _score(), _nonmax(), _corners_list(), _num_corner_candidates(0)
+{
+}
+
+void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
+ float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
+ BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+
+ const TensorShape shape = input->info()->tensor_shape();
+ TensorInfo tensor_info_gxgy;
+
+ if(gradient_size < 7)
+ {
+ tensor_info_gxgy.init(shape, Format::S16);
+ }
+ else
+ {
+ tensor_info_gxgy.init(shape, Format::S32);
+ }
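+    // Sobel 3x3 and 5x5 gradients fit in signed 16-bit values; the 7x7 kernel needs S32.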
+
+ _gx.allocator()->init(tensor_info_gxgy);
+ _gy.allocator()->init(tensor_info_gxgy);
+
+ TensorInfo tensor_info_score(shape, Format::F32);
+ _score.allocator()->init(tensor_info_score);
+ _nonmax.allocator()->init(tensor_info_score);
+
+ _corners_list = arm_compute::cpp14::make_unique<InternalKeypoint[]>(shape.x() * shape.y());
+
+    // Set/init Sobel kernel according to gradient_size
+ switch(gradient_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel3x3>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel5x5>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<NESobel7x7>();
+ k->configure(input, &_gx, &_gy, border_mode, constant_border_value);
+ _sobel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Gradient size not implemented");
+ }
+
+ // Normalization factor
+    const float norm_factor = 1.0f / (255.0f * std::pow(4.0f, gradient_size / 2) * block_size);
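+    // 255 normalises the U8 input range; 4^(gradient_size / 2) roughly matches the gain of
+    // the Sobel coefficients, and block_size the size of the averaging window.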
+
+ if(use_fp16)
+ {
+ switch(block_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+            }
+            break;
+            default:
+ break;
+ }
+ }
+ else
+ {
+        // Set/init Harris Score kernel according to block_size
+ switch(block_size)
+ {
+ case 3:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 5:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+ }
+ break;
+ case 7:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+ k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+ _harris_score = std::move(k);
+            }
+            break;
+            default:
+ break;
+ }
+ }
+
+ // Configure border filling before harris score
+ _border_gx.configure(&_gx, _harris_score->border_size(), border_mode, constant_border_value);
+ _border_gy.configure(&_gy, _harris_score->border_size(), border_mode, constant_border_value);
+
+ // Init non-maxima suppression function
+ _non_max_suppr.configure(&_score, &_nonmax, border_mode);
+
+ // Init corner candidates kernel
+ _candidates.configure(&_nonmax, _corners_list.get(), &_num_corner_candidates);
+
+ // Init euclidean distance
+ _sort_euclidean.configure(_corners_list.get(), corners, &_num_corner_candidates, min_dist);
+
+ // Allocate once all the configure methods have been called
+ _gx.allocator()->allocate();
+ _gy.allocator()->allocate();
+ _score.allocator()->allocate();
+ _nonmax.allocator()->allocate();
+}
+
+void NEHarrisCorners::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function");
+
+    // Reset the number of corner candidates
+ _num_corner_candidates = 0;
+
+ // Run Sobel kernel
+ _sobel->run();
+
+ // Fill border before harris score kernel
+ _border_gx.run(_border_gx.window());
+ _border_gy.run(_border_gy.window());
+
+ // Run harris score kernel
+ NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
+
+ // Run non-maxima suppression
+ _non_max_suppr.run();
+
+ // Run corner candidate kernel
+ NEScheduler::get().schedule(&_candidates, Window::DimY);
+
+ // Run sort & euclidean distance
+ _sort_euclidean.run(_sort_euclidean.window());
+}
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
new file mode 100644
index 0000000000..c42b2a56e0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IDistribution1D.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEHistogram::NEHistogram()
+ : _histogram_kernel(), _local_hist(), _window_lut(arm_compute::cpp14::make_unique<uint32_t[]>(window_lut_default_size)), _local_hist_size(0)
+{
+}
+
+void NEHistogram::configure(const IImage *input, IDistribution1D *output)
+{
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ // Allocate space for threads local histograms
+ _local_hist_size = output->num_bins() * NEScheduler::get().num_threads();
+ _local_hist = arm_compute::cpp14::make_unique<uint32_t[]>(_local_hist_size);
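+    // Each thread accumulates into its own num_bins-sized slice; the kernel then merges
+    // the per-thread slices into the output distribution.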
+
+ // Configure kernel
+ _histogram_kernel.configure(input, output, _local_hist.get(), _window_lut.get());
+}
+
+void NEHistogram::run()
+{
+ // Calculate histogram of input.
+ NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
new file mode 100644
index 0000000000..af604e9295
--- /dev/null
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEIntegralImage::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NEIntegralImageKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+ _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, 0);
+}
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
new file mode 100644
index 0000000000..8232c79f2d
--- /dev/null
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
+#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+NELaplacianPyramid::NELaplacianPyramid()
+ : _num_levels(0), _gaussian_pyr_function(), _convf(), _subf(), _gauss_pyr(), _conv_pyr(), _depth_function()
+{
+}
+
+void NELaplacianPyramid::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(0 == _num_levels, "Unconfigured function");
+
+ // Compute Gaussian Pyramid
+ _gaussian_pyr_function.run();
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ // Apply Gaussian filter to gaussian pyramid image
+ _convf[i].run();
+ }
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ // Compute laplacian image
+ _subf[i].run();
+ }
+
+ _depth_function.run();
+}
+
+void NELaplacianPyramid::configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(0 == pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ _num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the Gaussian pyramid and the convolved pyramid
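+ // Each level halves the previous one (scale 0.5); both internal pyramids use the U8 format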
+ PyramidInfo pyramid_info;
+ pyramid_info.init(_num_levels, 0.5f, pyramid->info()->tensor_shape(), arm_compute::Format::U8);
+
+ _gauss_pyr.init(pyramid_info);
+ _conv_pyr.init(pyramid_info);
+
+ // Create Gaussian Pyramid function
+ _gaussian_pyr_function.configure(input, &_gauss_pyr, border_mode, constant_border_value);
+
+ _convf = arm_compute::cpp14::make_unique<NEGaussian5x5[]>(_num_levels);
+ _subf = arm_compute::cpp14::make_unique<NEArithmeticSubtraction[]>(_num_levels);
+
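+ // For each level i: conv(i) = Gauss5x5(gauss(i)) and L(i) = gauss(i) - conv(i), i.e. each
+ // Laplacian level is the difference between a Gaussian level and its low-pass filtered version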
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ _convf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), border_mode, constant_border_value);
+ _subf[i].configure(_gauss_pyr.get_pyramid_level(i), _conv_pyr.get_pyramid_level(i), pyramid->get_pyramid_level(i), ConvertPolicy::WRAP);
+ }
+
+ _depth_function.configure(_conv_pyr.get_pyramid_level(_num_levels - 1), output, ConvertPolicy::WRAP, 0);
+
+ _gauss_pyr.allocate();
+ _conv_pyr.allocate();
+}
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
new file mode 100644
index 0000000000..36ac4a74d1
--- /dev/null
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IPyramid.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+NELaplacianReconstruct::NELaplacianReconstruct()
+ : _tmp_pyr(), _addf(), _scalef(), _depthf()
+{
+}
+
+void NELaplacianReconstruct::configure(const IPyramid *pyramid, const ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
+ ARM_COMPUTE_ERROR_ON(input == output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() != pyramid->get_pyramid_level(0)->info()->num_dimensions());
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != pyramid->get_pyramid_level(0)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != pyramid->get_pyramid_level(0)->info()->dimension(1));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->get_pyramid_level(pyramid->info()->num_levels() - 1)->info()->dimension(1));
+
+ const size_t num_levels = pyramid->info()->num_levels();
+
+ // Create and initialize the tmp pyramid: tmp(n-1) = input + L(n-1), then tmp(l) = upsample(tmp(l+1)) + L(l)
+ PyramidInfo pyramid_info;
+ pyramid_info.init(num_levels, 0.5f, output->info()->tensor_shape(), arm_compute::Format::S16);
+
+ _tmp_pyr.init(pyramid_info);
+
+ // Allocate the add and scale functions. The coarsest level is produced directly by an addition, so only num_levels - 1 scale functions are needed.
+ _addf = arm_compute::cpp14::make_unique<NEArithmeticAddition[]>(num_levels);
+ _scalef = arm_compute::cpp14::make_unique<NEScale[]>(num_levels - 1);
+
+ const size_t last_level = num_levels - 1;
+
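+ // Reconstruction starts at the coarsest level: tmp(last) = input + L(last)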
+ _addf[last_level].configure(input, pyramid->get_pyramid_level(last_level), _tmp_pyr.get_pyramid_level(last_level), ConvertPolicy::SATURATE);
+
+ // For each level l in [0, n-2]: upsample level l + 1 into level l, then add the Laplacian of level l
+ for(size_t l = 0; l < last_level; ++l)
+ {
+ _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
+ _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
+ }
+
+ // Convert level 0 from S16 to U8
+ _depthf.configure(_tmp_pyr.get_pyramid_level(0), output, ConvertPolicy::SATURATE, 0);
+
+ _tmp_pyr.allocate();
+}
+
+void NELaplacianReconstruct::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_addf == nullptr, "Unconfigured function");
+
+ const size_t last_level = _tmp_pyr.info()->num_levels() - 1;
+
+ _addf[last_level].run();
+
+ // Run for l = last_level - 1 down to 0
+ for(size_t l = last_level; l-- > 0;)
+ {
+ _scalef[l].run();
+ _addf[l].run();
+ }
+
+ _depthf.run();
+}
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
new file mode 100644
index 0000000000..85d7ba3650
--- /dev/null
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+NELocallyConnectedLayer::NELocallyConnectedLayer()
+ : _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(), _is_first_run(false)
+{
+}
+
+void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
+ }
+
+ bool has_bias = (biases != nullptr);
+ _is_first_run = true;
+
+ // Get parameters for conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ unsigned int pad_x = 0;
+ unsigned int pad_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+ std::tie(pad_x, pad_y) = conv_info.pad();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0),
+ stride_x, stride_y, pad_x, pad_y, conv_info.round());
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+ ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+ // Create tensor to store the reshaped weights
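+ // A locally connected layer keeps one weight matrix per output spatial position, hence the
+ // third dimension of size conv_w * conv_h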
+ const size_t mat_weights_cols = weights->info()->dimension(3);
+ const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((has_bias) ? 1 : 0);
+ const size_t mat_weights_num = weights->info()->dimension(4);
+
+ const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+ _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+
+ // Create tensor to store im2col reshaped inputs
+ const size_t mat_input_cols = mat_weights_rows;
+ const size_t mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
+
+ // Create locally connected layer output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
+
+ // Configure kernels
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, std::make_pair(conv_w, conv_h), conv_info, has_bias);
+ _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+ _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+
+ // Allocate intermediate tensors
+ _weights_reshaped.allocator()->allocate();
+ _input_im2col_reshaped.allocator()->allocate();
+ _gemm_output.allocator()->allocate();
+}
+
+void NELocallyConnectedLayer::run()
+{
+ // Run weights reshaping (runs only once per configure() call)
+ if(_is_first_run)
+ {
+ _is_first_run = false;
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ }
+
+ // Run input reshaping
+ NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+
+ // Run the GEMM on the reshaped matrices
+ NEScheduler::get().schedule(&_mm_kernel, Window::DimX);
+
+ // Reshape output matrix
+ NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
new file mode 100644
index 0000000000..9390ca2b6a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16)
+{
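+ // Both paths compute the L2 magnitude only: the phase output of the underlying kernel is left unset (nullptr)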
+ if(use_fp16)
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
+ else
+ {
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, output, nullptr);
+ _kernel = std::move(k);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
new file mode 100644
index 0000000000..47143f5e5b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
+
+#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
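+// Minimal usage sketch (illustrative only; assumes `img` is an allocated U8 image):
+//   NEMeanStdDev mean_stddev;
+//   float mean = 0.f, stddev = 0.f;
+//   mean_stddev.configure(&img, &mean, &stddev);
+//   mean_stddev.run();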
+
+NEMeanStdDev::NEMeanStdDev()
+ : _mean_stddev_kernel(), _global_sum(0), _global_sum_squared(0)
+{
+}
+
+void NEMeanStdDev::configure(const IImage *input, float *mean, float *stddev)
+{
+ _mean_stddev_kernel.configure(input, mean, &_global_sum, stddev, &_global_sum_squared);
+}
+
+void NEMeanStdDev::run()
+{
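+ // Reset the global accumulators: the kernel adds each row's sum (and sum of squares) to them on every run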
+ _global_sum = 0;
+ _global_sum_squared = 0;
+
+ NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp
new file mode 100644
index 0000000000..aa7cc97081
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMedian3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEMedian3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEMedian3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
new file mode 100644
index 0000000000..cab9200cf8
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEMinMaxLocation::NEMinMaxLocation()
+ : _min_max(), _min_max_loc()
+{
+}
+
+void NEMinMaxLocation::configure(const IImage *input, int32_t *min, int32_t *max, ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, uint32_t *min_count, uint32_t *max_count)
+{
+ _min_max.configure(input, min, max);
+ _min_max_loc.configure(input, min, max, min_loc, max_loc, min_count, max_count);
+}
+
+void NEMinMaxLocation::run()
+{
+ _min_max.reset();
+
+ /* Run min max kernel */
+ NEScheduler::get().schedule(&_min_max, Window::DimY);
+
+ /* Run min max location */
+ NEScheduler::get().schedule(&_min_max_loc, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp
new file mode 100644
index 0000000000..01aea3b671
--- /dev/null
+++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NENonLinearFilter::configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+ BorderMode border_mode,
+ uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NENonLinearFilterKernel>();
+ k->configure(input, output, function, mask_size, pattern, mask, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
new file mode 100644
index 0000000000..a7b3759a45
--- /dev/null
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NENonMaximaSuppression3x3::configure(ITensor *input, ITensor *output, BorderMode border_mode)
+{
+ auto k = arm_compute::cpp14::make_unique<NENonMaximaSuppression3x3Kernel>();
+ k->configure(input, output, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+
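+ // A 3x3 suppression window needs a 1-pixel border; fill it with zeros unless the caller requested an undefined border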
+ if(border_mode != BorderMode::UNDEFINED)
+ {
+ _border_handler.configure(input, 1, BorderMode::CONSTANT, 0);
+ }
+ else
+ {
+ _border_handler.configure(input, 1, BorderMode::UNDEFINED, 0);
+ }
+}
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
new file mode 100644
index 0000000000..69ff32591f
--- /dev/null
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NENormalizationLayer::NENormalizationLayer()
+ : _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
+{
+}
+
+void NENormalizationLayer::configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info)
+{
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _input_squared.allocator()->init(tensor_info);
+
+ // Configure kernels
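+ // _multiply_kernel squares the input element-wise; _norm_kernel then consumes the squared
+ // input when computing the normalization denominator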
+ _norm_kernel.configure(input, &_input_squared, output, norm_info);
+ _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0.0f));
+
+ // Allocate the tensor once the configure methods have been called
+ _input_squared.allocator()->allocate();
+}
+
+void NENormalizationLayer::run()
+{
+ NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_border_handler, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
new file mode 100644
index 0000000000..49135e442c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
+#include "arm_compute/runtime/Pyramid.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NEOpticalFlow::NEOpticalFlow()
+ : _func_scharr(), _kernel_tracker(), _scharr_gx(), _scharr_gy(), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _new_points_internal(), _old_points_internal(),
+ _num_levels(0)
+{
+}
+
+void NEOpticalFlow::configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates,
+ IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension,
+ bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == old_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_pyramid);
+ ARM_COMPUTE_ERROR_ON(nullptr == old_points);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points_estimates);
+ ARM_COMPUTE_ERROR_ON(nullptr == new_points);
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->num_levels() != new_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(0 == old_pyramid->info()->num_levels());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->width() != new_pyramid->info()->width());
+ ARM_COMPUTE_ERROR_ON(old_pyramid->info()->height() != new_pyramid->info()->height());
+ ARM_COMPUTE_ERROR_ON(use_initial_estimate && old_points->num_values() != new_points_estimates->num_values());
+
+ _num_levels = old_pyramid->info()->num_levels();
+ _old_points = old_points;
+ _new_points = new_points;
+ _new_points_estimates = new_points_estimates;
+
+ const float pyr_scale = old_pyramid->info()->scale();
+
+ _func_scharr = arm_compute::cpp14::make_unique<NEScharr3x3[]>(_num_levels);
+ _kernel_tracker = arm_compute::cpp14::make_unique<NELKTrackerKernel[]>(_num_levels);
+ _scharr_gx = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
+ _scharr_gy = arm_compute::cpp14::make_unique<Tensor[]>(_num_levels);
+
+ _old_points_internal = LKInternalKeypointArray(old_points->num_values());
+ _new_points_internal = LKInternalKeypointArray(old_points->num_values());
+ _new_points->resize(old_points->num_values());
+
+ for(unsigned int i = 0; i < _num_levels; ++i)
+ {
+ // Get the images from the i-th level of the old and new pyramids
+ IImage *old_ith_input = old_pyramid->get_pyramid_level(i);
+ IImage *new_ith_input = new_pyramid->get_pyramid_level(i);
+
+ // Get width and height of images
+ const unsigned int width_ith = old_ith_input->info()->dimension(0);
+ const unsigned int height_ith = new_ith_input->info()->dimension(1);
+
+ TensorInfo tensor_info(TensorShape(width_ith, height_ith), Format::S16);
+
+ _scharr_gx[i].allocator()->init(tensor_info);
+ _scharr_gy[i].allocator()->init(tensor_info);
+
+ // Init Scharr kernel
+ _func_scharr[i].configure(old_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i, border_mode, constant_border_value);
+
+ // Init Lucas-Kanade kernel
+ _kernel_tracker[i].configure(old_ith_input, new_ith_input, _scharr_gx.get() + i, _scharr_gy.get() + i,
+ old_points, new_points_estimates, new_points,
+ &_old_points_internal, &_new_points_internal,
+ termination, use_initial_estimate, epsilon, num_iterations, window_dimension,
+ i, _num_levels, pyr_scale);
+
+ _scharr_gx[i].allocator()->allocate();
+ _scharr_gy[i].allocator()->allocate();
+ }
+}
+
+void NEOpticalFlow::run()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_num_levels == 0, "Unconfigured function");
+
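+ // Process the pyramid from the coarsest level down to level 0 (full resolution), refining the tracked points at each level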
+ for(unsigned int level = _num_levels; level > 0; --level)
+ {
+ // Run Scharr kernel
+ _func_scharr[level - 1].run();
+
+ // Run Lucas-Kanade kernel
+ NEScheduler::get().schedule(_kernel_tracker.get() + level - 1, Window::DimX);
+ }
+}
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
new file mode 100644
index 0000000000..7683f461d3
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPhase.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
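+ // Only the phase output of the magnitude/phase kernel is requested; the magnitude output is left unset (nullptr)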
+ auto k = arm_compute::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+ k->configure(input1, input2, nullptr, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
new file mode 100644
index 0000000000..056d33b370
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ auto k = arm_compute::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
new file mode 100644
index 0000000000..6f0cc4f160
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
+
+using namespace arm_compute;
+
+void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+ // Configure pooling kernel
+ auto k = arm_compute::cpp14::make_unique<NEPoolingLayerKernel>();
+ k->configure(input, output, pool_info);
+ _kernel = std::move(k);
+
+ // Configure the border handling according to the pooling type
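+ // (REPLICATE for max pooling so padded values can never exceed the real maximum; a constant zero border otherwise)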
+ BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+}
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
new file mode 100644
index 0000000000..9f06fb699c
--- /dev/null
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NERemap.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
+
+ auto k = arm_compute::cpp14::make_unique<NERemapKernel>();
+
+ k->configure(input, map_x, map_y, output, policy);
+
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
new file mode 100644
index 0000000000..b70f626df0
--- /dev/null
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEScale.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include <cmath>
+#include <cstddef>
+#include <utility>
+
+using namespace arm_compute;
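+// Minimal usage sketch (illustrative only; assumes `src` and `dst` are allocated U8 tensors of the desired shapes):
+//   NEScale scale;
+//   scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);
+//   scale.run();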
+
+namespace
+{
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
+ win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
+
+ if(dx != nullptr && dy != nullptr)
+ {
+ // Pre-compute the offset and pixel's distance for BILINEAR interpolation
+ Iterator offsets_it(offsets, win);
+ Iterator dx_it(dx, win);
+ Iterator dy_it(dy, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
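+ // Map the destination pixel centre back to source coordinates (pixel-centre convention), then
+ // split the result into an integer byte offset and the fractional distances dx/dy used for blending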
+ const float in_x = (id.x() + 0.5f) * wr - 0.5f;
+ const float in_y = (id.y() + 0.5f) * hr - 0.5f;
+ const int in_xi = std::floor(in_x);
+ const int in_yi = std::floor(in_y);
+
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+ *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
+ *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
+ },
+ offsets_it, dx_it, dy_it);
+ }
+ else
+ {
+ // Pre-compute the offset for NEAREST interpolation
+ Iterator offsets_it(offsets, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const size_t in_xi = (id.x() + 0.5f) * wr;
+
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+ },
+ offsets_it);
+ }
+}
+} // namespace
+
+NEScale::NEScale()
+ : _offsets(), _dx(), _dy()
+{
+}
+
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON(nullptr == input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ // Get the tensor shape
+ const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+
+ // Compute the ratio between source width/height and destination width/height
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+ // Get the element size of the input image
+ const size_t input_element_size = input->info()->element_size();
+
+ // Area interpolation behaves like Nearest Neighbour when up-sampling
+ if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ {
+ policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+
+ auto k = arm_compute::cpp14::make_unique<NEScaleKernel>();
+
+ // Check if the border mode is UNDEFINED
+ const bool border_undefined = border_mode == BorderMode::UNDEFINED;
+
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ _offsets.allocator()->init(tensor_info_offsets);
+
+ k->configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+
+ // Allocate once the configure methods have been called
+ _offsets.allocator()->allocate();
+
+ // Pre-compute offsets for nearest interpolation
+ precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dxdy(shape, Format::F32);
+
+ _offsets.allocator()->init(tensor_info_offsets);
+ _dx.allocator()->init(tensor_info_dxdy);
+ _dy.allocator()->init(tensor_info_dxdy);
+
+ k->configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+
+ // Allocate once the configure methods have been called
+ _offsets.allocator()->allocate();
+ _dx.allocator()->allocate();
+ _dy.allocator()->allocate();
+
+ // Pre-compute dx, dy and offsets for bilinear interpolation
+ precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
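+ // The AREA kernel computes its sampling window on the fly: no pre-computed offset/dx/dy tensors are needed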
+ k->configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
new file mode 100644
index 0000000000..04b3f14ce7
--- /dev/null
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEScharr3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NEScharr3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
new file mode 100644
index 0000000000..3b46fd78c1
--- /dev/null
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
+#include "arm_compute/core/PixelValue.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NESobel3x3::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ auto k = arm_compute::cpp14::make_unique<NESobel3x3Kernel>();
+ k->configure(input, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(constant_border_value));
+}
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
new file mode 100644
index 0000000000..8967a22ba1
--- /dev/null
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NESobel5x5::NESobel5x5()
+ : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+{
+}
+
+void NESobel5x5::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::S16);
+
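+ // The 5x5 Sobel filter is applied separably: a horizontal pass writes S16 intermediate results
+ // which the vertical pass then combines into the final gradients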
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NESobel5x5::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
new file mode 100644
index 0000000000..f628da9709
--- /dev/null
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+using namespace arm_compute;
+
+NESobel7x7::NESobel7x7()
+ : _sobel_hor(), _sobel_vert(), _tmp_x(), _tmp_y(), _border_handler()
+{
+}
+
+void NESobel7x7::configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ const bool run_sobel_x = output_x != nullptr;
+ const bool run_sobel_y = output_y != nullptr;
+
+ TensorInfo tensor_info(input->info()->tensor_shape(), Format::S32);
+
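+    // Same separable scheme as the 5x5 variant, but with S32 intermediates:
+    // the larger 7x7 coefficient sums can overflow 16-bit storage.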
+ if(run_sobel_x && run_sobel_y)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, &_tmp_y, output_x, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ _tmp_y.allocator()->allocate();
+ }
+ else if(run_sobel_x)
+ {
+ _tmp_x.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, &_tmp_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(&_tmp_x, nullptr, output_x, nullptr, border_mode == BorderMode::UNDEFINED);
+ _tmp_x.allocator()->allocate();
+ }
+ else if(run_sobel_y)
+ {
+ _tmp_y.allocator()->init(tensor_info);
+ _sobel_hor.configure(input, nullptr, &_tmp_y, border_mode == BorderMode::UNDEFINED);
+ _sobel_vert.configure(nullptr, &_tmp_y, nullptr, output_y, border_mode == BorderMode::UNDEFINED);
+ _tmp_y.allocator()->allocate();
+ }
+
+ _border_handler.configure(input, _sobel_hor.border_size(), border_mode, PixelValue(constant_border_value));
+}
+
+void NESobel7x7::run()
+{
+ _border_handler.run(_border_handler.window());
+ NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
+ NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
new file mode 100644
index 0000000000..0651eab1bc
--- /dev/null
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include <cfloat>
+
+using namespace arm_compute;
+
+NESoftmaxLayer::NESoftmaxLayer()
+ : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+{
+}
+
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+
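+    // Softmax is computed in three numerically stable stages:
+    //   1) _max_kernel:           m   = max(x) along the innermost (x) dimension
+    //   2) _shift_exp_sum_kernel: tmp = exp(x - m), sum = reduce(tmp)
+    //   3) _norm_kernel:          out = tmp / sum
+    // Subtracting the per-row maximum before exponentiation avoids overflow.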
+    // Create info for the intermediate tensors
+ TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+ _tmp.allocator()->init(tensor_info_tmp);
+
+ TensorShape shape = input->info()->tensor_shape();
+ shape.set(0, 1);
+ TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+ _max.allocator()->init(tensor_info_max_sum);
+ _sum.allocator()->init(tensor_info_max_sum);
+
+ // Configure Kernels
+ _max_kernel.configure(input, &_max);
+ _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+ _norm_kernel.configure(&_tmp, &_sum, output);
+ _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX));
+
+ // Allocate intermediate tensors
+ _tmp.allocator()->allocate();
+ _max.allocator()->allocate();
+ _sum.allocator()->allocate();
+}
+
+void NESoftmaxLayer::run()
+{
+ NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_max_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+}
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
new file mode 100644
index 0000000000..ebb8a0ac9b
--- /dev/null
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NETableLookup::configure(const ITensor *input, const ILut *lut, ITensor *output)
+{
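+    // Usual simple-function pattern: create the kernel, configure it, then hand
+    // ownership to the base class, whose run() schedules _kernel.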
+ auto k = arm_compute::cpp14::make_unique<NETableLookupKernel>();
+ k->configure(input, lut, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
new file mode 100644
index 0000000000..93dc124880
--- /dev/null
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ auto k = arm_compute::cpp14::make_unique<NEThresholdKernel>();
+ k->configure(input, output, threshold, false_value, true_value, type, upper);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
new file mode 100644
index 0000000000..53ac9c5ee3
--- /dev/null
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NETranspose.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NETranspose::configure(const ITensor *input, ITensor *output)
+{
+ auto k = arm_compute::cpp14::make_unique<NETransposeKernel>();
+ k->configure(input, output);
+ _kernel = std::move(k);
+}
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
new file mode 100644
index 0000000000..24fb16f9e3
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEWarpAffine::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == matrix);
+
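+    // The warp kernel is templated on the interpolation policy, so the kernel
+    // type is chosen at configure time; AREA interpolation is not supported.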
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpAffineKernel<InterpolationPolicy::BILINEAR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ default:
+ ARM_COMPUTE_ERROR("Interpolation type not supported");
+ }
+
+ _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+}
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
new file mode 100644
index 0000000000..84b2df5bfa
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void NEWarpPerspective::configure(ITensor *input, ITensor *output, const float *matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(nullptr == matrix);
+
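+    // Same policy dispatch as NEWarpAffine; here the matrix is expected to
+    // describe a full 3x3 perspective transform rather than a 2x3 affine one.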
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ auto k = arm_compute::cpp14::make_unique<NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>>();
+ k->configure(input, output, matrix, border_mode, constant_border_value);
+ _kernel = std::move(k);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ default:
+ ARM_COMPUTE_ERROR("Interpolation type not supported");
+ }
+
+ _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+}
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
new file mode 100644
index 0000000000..0cced73276
--- /dev/null
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+
+#include "arm_compute/core/CPP/ICPPKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include <omp.h>
+
+using namespace arm_compute;
+
+OMPScheduler &OMPScheduler::get()
+{
+ static OMPScheduler scheduler;
+ return scheduler;
+}
+
+OMPScheduler::OMPScheduler()
+ : _num_threads(omp_get_max_threads())
+{
+}
+
+unsigned int OMPScheduler::num_threads() const
+{
+ return _num_threads;
+}
+
+void OMPScheduler::set_num_threads(unsigned int num_threads)
+{
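+    // Requesting 0 threads means "use all cores reported by OpenMP".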
+ const unsigned int num_cores = omp_get_max_threads();
+ _num_threads = num_threads == 0 ? num_cores : num_threads;
+}
+
+void OMPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+
+ const Window &max_window = kernel->window();
+ const unsigned int num_iterations = max_window.num_iterations(split_dimension);
+ const unsigned int num_threads = std::min(num_iterations, _num_threads);
+
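+    // Run single-threaded if the kernel cannot be split or only one iteration is
+    // available; otherwise each OpenMP thread processes one slice of the window.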
+ if(!kernel->is_parallelisable() || 1 == num_threads)
+ {
+ kernel->run(max_window);
+ }
+ else
+ {
+ #pragma omp parallel num_threads(num_threads)
+ {
+ #pragma omp for
+ for(unsigned int t = 0; t < num_threads; ++t)
+ {
+ Window win = max_window.split_window(split_dimension, t, num_threads);
+ win.set_thread_id(t);
+ win.set_num_threads(num_threads);
+ kernel->run(win);
+ }
+ }
+ }
+}
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
new file mode 100644
index 0000000000..f1b6c93b50
--- /dev/null
+++ b/src/runtime/Pyramid.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Pyramid.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PyramidInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void Pyramid::init(const PyramidInfo &info)
+{
+ internal_init(info, false);
+}
+
+void Pyramid::init_auto_padding(const PyramidInfo &info)
+{
+ internal_init(info, true);
+}
+
+void Pyramid::internal_init(const PyramidInfo &info, bool auto_padding)
+{
+ _info = info;
+ _pyramid = arm_compute::cpp14::make_unique<Tensor[]>(_info.num_levels());
+
+ size_t w = _info.width();
+ size_t h = _info.height();
+ size_t ref_w = w;
+ size_t ref_h = h;
+ bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
+ TensorShape tensor_shape = _info.tensor_shape();
+
+ // Note: Look-up table used by the OpenVX sample implementation
+ const float c_orbscale[4] = { 0.5f,
+ SCALE_PYRAMID_ORB,
+ SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB,
+                                  SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB * SCALE_PYRAMID_ORB
+ };
+
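+    // Level 0 keeps the input dimensions; every further level is shrunk by the
+    // pyramid scale. For ORB pyramids the look-up table refines the per-level
+    // factor and the reference size is re-based every fourth level (one octave,
+    // since SCALE_PYRAMID_ORB^4 == 0.5).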
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ TensorInfo tensor_info(tensor_shape, _info.format());
+
+ if(auto_padding)
+ {
+ tensor_info.auto_padding();
+ }
+
+ (_pyramid.get() + i)->allocator()->init(tensor_info);
+
+ if(is_orb_scale)
+ {
+ float orb_scale = c_orbscale[(i + 1) % 4];
+ w = static_cast<int>(std::ceil(static_cast<float>(ref_w) * orb_scale));
+ h = static_cast<int>(std::ceil(static_cast<float>(ref_h) * orb_scale));
+
+ if(0 == ((i + 1) % 4))
+ {
+ ref_w = w;
+ ref_h = h;
+ }
+ }
+ else
+ {
+ w = (w + 1) * _info.scale();
+ h = (h + 1) * _info.scale();
+ }
+
+ // Update tensor_shape
+ tensor_shape.set(0, w);
+ tensor_shape.set(1, h);
+ }
+}
+
+void Pyramid::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_pyramid == nullptr);
+
+ for(size_t i = 0; i < _info.num_levels(); ++i)
+ {
+ (_pyramid.get() + i)->allocator()->allocate();
+ }
+}
+
+const PyramidInfo *Pyramid::info() const
+{
+ return &_info;
+}
+
+Tensor *Pyramid::get_pyramid_level(size_t index) const
+{
+ ARM_COMPUTE_ERROR_ON(index >= _info.num_levels());
+
+ return (_pyramid.get() + index);
+}
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
new file mode 100644
index 0000000000..a131928293
--- /dev/null
+++ b/src/runtime/Scheduler.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "arm_compute/core/Error.h"
+#if ARM_COMPUTE_CPP_SCHEDULER
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#endif
+
+#include "arm_compute/runtime/SingleThreadScheduler.h"
+
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+#include "arm_compute/runtime/OMP/OMPScheduler.h"
+#endif
+
+using namespace arm_compute;
+
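+// Default scheduler type: prefer the C++11 scheduler when compiled in, then the
+// OpenMP one, otherwise fall back to the single-threaded implementation.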
+#if !ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::OMP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && !ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#elif ARM_COMPUTE_CPP_SCHEDULER && ARM_COMPUTE_OPENMP_SCHEDULER
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
+#else
+Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
+#endif
+
+void Scheduler::set(Type t)
+{
+ ARM_COMPUTE_ERROR_ON(!Scheduler::is_available(t));
+ _scheduler_type = t;
+}
+
+bool Scheduler::is_available(Type t)
+{
+ switch(t)
+ {
+ case Type::ST:
+ {
+ return true;
+ }
+ case Type::CPP:
+ {
+#if ARM_COMPUTE_CPP_SCHEDULER
+ return true;
+#else
+ return false;
+#endif
+ }
+ case Type::OMP:
+ {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+ return true;
+#else
+ return false;
+#endif
+ }
+ case Type::CUSTOM:
+ {
+ return _custom_scheduler != nullptr;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid Scheduler type");
+ return false;
+ }
+ }
+}
+
+Scheduler::Type Scheduler::get_type()
+{
+ return _scheduler_type;
+}
+
+IScheduler &Scheduler::get()
+{
+ switch(_scheduler_type)
+ {
+ case Type::ST:
+ {
+ return SingleThreadScheduler::get();
+ }
+ case Type::CPP:
+ {
+#if ARM_COMPUTE_CPP_SCHEDULER
+ return CPPScheduler::get();
+#else
+            ARM_COMPUTE_ERROR("Recompile with cppthreads=1 to use the C++11 scheduler.");
+#endif
+ break;
+ }
+ case Type::OMP:
+ {
+#if ARM_COMPUTE_OPENMP_SCHEDULER
+ return OMPScheduler::get();
+#else
+            ARM_COMPUTE_ERROR("Recompile with openmp=1 to use the OpenMP scheduler.");
+#endif
+ break;
+ }
+ case Type::CUSTOM:
+ {
+ if(_custom_scheduler == nullptr)
+ {
+                ARM_COMPUTE_ERROR("No custom scheduler has been set up. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()");
+ }
+ else
+ {
+ return *_custom_scheduler;
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid Scheduler type");
+ break;
+ }
+ }
+ return SingleThreadScheduler::get();
+}
+
+std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
+
+void Scheduler::set(std::shared_ptr<IScheduler> &scheduler)
+{
+ _custom_scheduler = scheduler;
+ set(Type::CUSTOM);
+}
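+// A minimal usage sketch (caller code, names hypothetical): install a custom
+// scheduler and route all kernel scheduling through it:
+//   std::shared_ptr<IScheduler> s = std::make_shared<MyScheduler>();
+//   Scheduler::set(s);  // switches the type to Type::CUSTOM
+//   Scheduler::get().schedule(&some_kernel, Window::DimY);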
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
new file mode 100644
index 0000000000..32924be3dc
--- /dev/null
+++ b/src/runtime/SubTensor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/SubTensor.h"
+
+#include "arm_compute/core/Error.h"
+
+using namespace arm_compute;
+
+SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+ : _parent(nullptr), _info()
+{
+ ARM_COMPUTE_ERROR_ON(parent == nullptr);
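+    // A sub-tensor owns no memory: it wraps the parent's info with a new shape
+    // and offset, and buffer() below simply forwards to the parent's allocation.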
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _parent = parent;
+}
+
+ITensorInfo *SubTensor::info() const
+{
+ return &_info;
+}
+
+ITensorInfo *SubTensor::info()
+{
+ return &_info;
+}
+
+uint8_t *SubTensor::buffer() const
+{
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->buffer();
+}
+
+ITensor *SubTensor::parent()
+{
+ return _parent;
+}
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
new file mode 100644
index 0000000000..435068c61d
--- /dev/null
+++ b/src/runtime/Tensor.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Tensor.h"
+
+using namespace arm_compute;
+
+Tensor::Tensor()
+ : _allocator()
+{
+}
+
+ITensorInfo *Tensor::info() const
+{
+ return &_allocator.info();
+}
+
+ITensorInfo *Tensor::info()
+{
+ return &_allocator.info();
+}
+
+uint8_t *Tensor::buffer() const
+{
+ return _allocator.data();
+}
+
+TensorAllocator *Tensor::allocator()
+{
+ return &_allocator;
+}
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
new file mode 100644
index 0000000000..5c719c761a
--- /dev/null
+++ b/src/runtime/TensorAllocator.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+namespace
+{
+bool validate_subtensor_shape(const TensorInfo &parent_info, const TensorInfo &child_info, const Coordinates &coords)
+{
+ bool is_valid = true;
+ const TensorShape &parent_shape = parent_info.tensor_shape();
+ const TensorShape &child_shape = child_info.tensor_shape();
+ const size_t parent_dims = parent_info.num_dimensions();
+ const size_t child_dims = child_info.num_dimensions();
+
+ if(child_dims <= parent_dims)
+ {
+ for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
+ {
+ const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1];
+
+ if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
+ {
+ is_valid = false;
+ break;
+ }
+ }
+ }
+ else
+ {
+ is_valid = false;
+ }
+
+ return is_valid;
+}
+} // namespace
+
+TensorAllocator::TensorAllocator()
+ : _buffer(nullptr)
+{
+}
+
+void TensorAllocator::init(const TensorAllocator &allocator, const Coordinates &coords, TensorInfo sub_info)
+{
+ // Get parent info
+ const TensorInfo parent_info = allocator.info();
+
+ // Check if coordinates and new shape are within the parent tensor
+ ARM_COMPUTE_ERROR_ON(!validate_subtensor_shape(parent_info, sub_info, coords));
+ ARM_COMPUTE_UNUSED(validate_subtensor_shape);
+
+ // Copy pointer to buffer
+ _buffer = allocator._buffer;
+
+ // Init tensor info with new dimensions
+ size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
+ sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size);
+
+ // Set TensorInfo
+ init(sub_info);
+}
+
+uint8_t *TensorAllocator::data() const
+{
+    return (_buffer != nullptr) ? _buffer->data() : nullptr;
+}
+
+void TensorAllocator::allocate()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
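+    // The backing store is a plain byte vector held through a shared_ptr, so
+    // sub-tensors initialised from this allocator share the same allocation.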
+
+ _buffer = std::make_shared<std::vector<uint8_t>>(info().total_size());
+ info().set_is_resizable(false);
+}
+
+void TensorAllocator::free()
+{
+ ARM_COMPUTE_ERROR_ON(_buffer == nullptr);
+
+ _buffer.reset();
+ info().set_is_resizable(true);
+}
+
+uint8_t *TensorAllocator::lock()
+{
+    return (_buffer != nullptr) ? _buffer->data() : nullptr;
+}
+
+void TensorAllocator::unlock()
+{
+}
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
new file mode 100644
index 0000000000..1b06117c7b
--- /dev/null
+++ b/src/runtime/Utils.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/Utils.h"
+
+#include <map>
+#include <string>
+
+using namespace arm_compute;
+
+const std::string &arm_compute::string_from_scheduler_type(Scheduler::Type t)
+{
+ static std::map<Scheduler::Type, const std::string> scheduler_type_map =
+ {
+ { Scheduler::Type::ST, "Single Thread" },
+ { Scheduler::Type::CPP, "C++11 Threads" },
+ { Scheduler::Type::OMP, "OpenMP Threads" },
+ { Scheduler::Type::CUSTOM, "Custom" }
+ };
+
+ return scheduler_type_map[t];
+}
diff --git a/tests/CL/CLAccessor.h b/tests/CL/CLAccessor.h
new file mode 100644
index 0000000000..21db3ee23d
--- /dev/null
+++ b/tests/CL/CLAccessor.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CL_CLACCESSOR_H__
+#define __ARM_COMPUTE_TEST_CL_CLACCESSOR_H__
+
+#include "IAccessor.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace cl
+{
+/** Accessor implementation for @ref CLTensor objects. */
+class CLAccessor : public IAccessor
+{
+public:
+ /** Create an accessor for the given @p tensor.
+ *
+     * @param[in, out] tensor Tensor to be accessed.
+ *
+ * @note The CL memory is mapped by the constructor.
+ *
+ */
+ CLAccessor(CLTensor &tensor);
+
+ CLAccessor(const CLAccessor &) = delete;
+ CLAccessor &operator=(const CLAccessor &) = delete;
+ CLAccessor(CLAccessor &&) = default;
+ CLAccessor &operator=(CLAccessor &&) = default;
+
+ /** Destructor that unmaps the CL memory. */
+ ~CLAccessor();
+
+ TensorShape shape() const override;
+ size_t element_size() const override;
+ size_t size() const override;
+ Format format() const override;
+ DataType data_type() const override;
+ int num_channels() const override;
+ int num_elements() const override;
+ int fixed_point_position() const override;
+ const void *operator()(const Coordinates &coord) const override;
+ void *operator()(const Coordinates &coord) override;
+
+private:
+ CLTensor &_tensor;
+};
+
+inline CLAccessor::CLAccessor(CLTensor &tensor)
+ : _tensor{ tensor }
+{
+ _tensor.map();
+}
+
+inline CLAccessor::~CLAccessor()
+{
+ _tensor.unmap();
+}
+
+inline TensorShape CLAccessor::shape() const
+{
+ return _tensor.info()->tensor_shape();
+}
+
+inline size_t CLAccessor::element_size() const
+{
+ return _tensor.info()->element_size();
+}
+
+inline size_t CLAccessor::size() const
+{
+ return _tensor.info()->total_size();
+}
+
+inline Format CLAccessor::format() const
+{
+ return _tensor.info()->format();
+}
+
+inline DataType CLAccessor::data_type() const
+{
+ return _tensor.info()->data_type();
+}
+
+inline int CLAccessor::num_channels() const
+{
+ return _tensor.info()->num_channels();
+}
+
+inline int CLAccessor::num_elements() const
+{
+ return _tensor.info()->tensor_shape().total_size();
+}
+
+inline int CLAccessor::fixed_point_position() const
+{
+ return _tensor.info()->fixed_point_position();
+}
+
+inline const void *CLAccessor::operator()(const Coordinates &coord) const
+{
+ return _tensor.ptr_to_element(coord);
+}
+
+inline void *CLAccessor::operator()(const Coordinates &coord)
+{
+ return _tensor.ptr_to_element(coord);
+}
+} // namespace cl
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/CL/Helper.h b/tests/CL/Helper.h
new file mode 100644
index 0000000000..a6063e95ae
--- /dev/null
+++ b/tests/CL/Helper.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CL_HELPER_H__
+#define __ARM_COMPUTE_TEST_CL_HELPER_H__
+
+#include "Globals.h"
+#include "TensorLibrary.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace cl
+{
+/** Helper to create an empty tensor.
+ *
+ * @param[in] shape Desired shape.
+ * @param[in] data_type Desired data type.
+ * @param[in] num_channels        (Optional) Number of channels per tensor element.
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ *
+ * @return Empty @ref CLTensor with the specified shape and data type.
+ */
+inline CLTensor create_tensor(const TensorShape &shape, DataType data_type, int num_channels = 1, int fixed_point_position = 0)
+{
+ CLTensor tensor;
+ tensor.allocator()->init(TensorInfo(shape, num_channels, data_type, fixed_point_position));
+
+ return tensor;
+}
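+// Minimal usage sketch (shape values are made up):
+//   CLTensor t = create_tensor(TensorShape(640U, 480U), DataType::U8);
+//   t.allocator()->allocate();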
+
+/** Helper to create an empty tensor.
+ *
+ * @param[in] name File name from which to get the dimensions.
+ * @param[in] data_type Desired data type.
+ *
+ * @return Empty @ref CLTensor with the specified shape and data type.
+ */
+inline CLTensor create_tensor(const std::string &name, DataType data_type)
+{
+ constexpr unsigned int num_channels = 1;
+
+ const RawTensor &raw = library->get(name);
+
+ CLTensor tensor;
+ tensor.allocator()->init(TensorInfo(raw.shape(), num_channels, data_type));
+
+ return tensor;
+}
+} // namespace cl
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000000..3c4f5029b1
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,85 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+project (arm_compute_test)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_library(boost_unit_test_framework STATIC IMPORTED)
+set_target_properties(boost_unit_test_framework PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../3rdparty/linux/armv7a/libboost_unit_test_framework.a"
+)
+
+add_library(boost_program_options STATIC IMPORTED)
+set_target_properties(boost_program_options PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../3rdparty/linux/armv7a/libboost_program_options.a"
+)
+
+add_library(arm_compute SHARED IMPORTED)
+set_target_properties(arm_compute PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../build/libarm_compute.so"
+)
+
+include_directories("${CMAKE_SOURCE_DIR}")
+include_directories("${CMAKE_SOURCE_DIR}/..")
+include_directories("${CMAKE_SOURCE_DIR}/../3rdparty/include/")
+
+# TensorLibrary
+set(tensor_library_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/RawTensor.h
+ ${CMAKE_SOURCE_DIR}/RawTensor.cpp
+ ${CMAKE_SOURCE_DIR}/TensorCache.h
+ ${CMAKE_SOURCE_DIR}/TensorLibrary.h
+ ${CMAKE_SOURCE_DIR}/TensorLibrary.cpp
+)
+
+add_library(tensor_library OBJECT
+ ${tensor_library_SOURCE_FILES}
+)
+
+set(arm_compute_test_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/BorderModeDataset.h
+ ${CMAKE_SOURCE_DIR}/ConvertPolicyDataset.h
+ ${CMAKE_SOURCE_DIR}/Globals.h
+ ${CMAKE_SOURCE_DIR}/IAccessor.h
+ ${CMAKE_SOURCE_DIR}/ImageDatasets.h
+ ${CMAKE_SOURCE_DIR}/InterpolationPolicyDataset.h
+ ${CMAKE_SOURCE_DIR}/NormalizationTypeDataset.h
+ ${CMAKE_SOURCE_DIR}/ProgramOptions.h
+ ${CMAKE_SOURCE_DIR}/ProgramOptions.cpp
+ ${CMAKE_SOURCE_DIR}/RoundingPolicyDataset.h
+ ${CMAKE_SOURCE_DIR}/ShapeDatasets.h
+ ${CMAKE_SOURCE_DIR}/TypePrinter.h
+ ${CMAKE_SOURCE_DIR}/TypeReader.h
+ ${CMAKE_SOURCE_DIR}/UserConfiguration.h
+ ${CMAKE_SOURCE_DIR}/UserConfiguration.cpp
+ ${CMAKE_SOURCE_DIR}/Utils.h
+ ${CMAKE_SOURCE_DIR}/boost_wrapper.h
+)
+
+add_library(arm_compute_test OBJECT
+ ${arm_compute_test_SOURCE_FILES}
+)
+
+add_subdirectory(validation)
+add_subdirectory(benchmark)
diff --git a/tests/Globals.h b/tests/Globals.h
new file mode 100644
index 0000000000..a44d7ce83b
--- /dev/null
+++ b/tests/Globals.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_GLOBALS_H__
+#define __ARM_COMPUTE_TEST_GLOBALS_H__
+
+#include "TensorLibrary.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace test
+{
+extern std::unique_ptr<TensorLibrary> library;
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/IAccessor.h b/tests/IAccessor.h
new file mode 100644
index 0000000000..3c06dc36be
--- /dev/null
+++ b/tests/IAccessor.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_IACCESSOR_H__
+#define __ARM_COMPUTE_TEST_IACCESSOR_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+/** Common interface to provide information about and access to tensor-like
+ * structures.
+ */
+class IAccessor
+{
+public:
+ /** Pure virtual destructor. */
+ virtual ~IAccessor() = 0;
+
+ /** Shape of the tensor. */
+ virtual TensorShape shape() const = 0;
+
+ /** Size of each element in the tensor in bytes. */
+ virtual size_t element_size() const = 0;
+
+ /** Total size of the tensor in bytes. */
+ virtual size_t size() const = 0;
+
+ /** Image format of the tensor. */
+ virtual Format format() const = 0;
+
+ /** Data type of the tensor. */
+ virtual DataType data_type() const = 0;
+
+ /** Number of channels of the tensor. */
+ virtual int num_channels() const = 0;
+
+ /** Number of elements of the tensor. */
+ virtual int num_elements() const = 0;
+
+ /** Number of bits for the fractional part. */
+ virtual int fixed_point_position() const = 0;
+
+ /** Read only access to the specified element.
+ *
+ * @param[in] coord Coordinates of the desired element.
+ *
+ * @return A pointer to the desired element.
+ */
+ virtual const void *operator()(const Coordinates &coord) const = 0;
+
+ /** Access to the specified element.
+ *
+ * @param[in] coord Coordinates of the desired element.
+ *
+ * @return A pointer to the desired element.
+ */
+ virtual void *operator()(const Coordinates &coord) = 0;
+};
+
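+// A pure virtual destructor still needs a definition, because every derived
+// destructor calls it directly; declaring it pure merely keeps IAccessor abstract.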
+inline IAccessor::~IAccessor()
+{
+}
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/NEON/Helper.h b/tests/NEON/Helper.h
new file mode 100644
index 0000000000..c8f1c2e635
--- /dev/null
+++ b/tests/NEON/Helper.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_NEON_HELPER_H__
+#define __ARM_COMPUTE_TEST_NEON_HELPER_H__
+
+#include "Globals.h"
+#include "TensorLibrary.h"
+
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace neon
+{
+/** Helper to create an empty tensor.
+ *
+ * @param[in] shape Desired shape.
+ * @param[in] data_type Desired data type.
+ * @param[in] num_channels        (Optional) Number of channels per tensor element.
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16.
+ *
+ * @return Empty @ref Tensor with the specified shape and data type.
+ */
+inline Tensor create_tensor(const TensorShape &shape, DataType data_type, int num_channels = 1, int fixed_point_position = 0)
+{
+ Tensor tensor;
+ tensor.allocator()->init(TensorInfo(shape, num_channels, data_type, fixed_point_position));
+
+ return tensor;
+}
+
+/** Helper to create an empty tensor.
+ *
+ * @param[in] name File name from which to get the dimensions.
+ * @param[in] data_type Desired data type.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers
+ *
+ * @return Empty @ref Tensor with the specified shape and data type.
+ */
+inline Tensor create_tensor(const std::string &name, DataType data_type, int fixed_point_position = 0)
+{
+ constexpr unsigned int num_channels = 1;
+
+ const RawTensor &raw = library->get(name);
+
+ Tensor tensor;
+ tensor.allocator()->init(TensorInfo(raw.shape(), num_channels, data_type, fixed_point_position));
+
+ return tensor;
+}
+} // namespace neon
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/NEON/NEAccessor.h b/tests/NEON/NEAccessor.h
new file mode 100644
index 0000000000..be28c27d98
--- /dev/null
+++ b/tests/NEON/NEAccessor.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_NEON_NEACCESSOR_H__
+#define __ARM_COMPUTE_TEST_NEON_NEACCESSOR_H__
+
+#include "IAccessor.h"
+
+#include "arm_compute/runtime/Tensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace neon
+{
+/** Accessor implementation for @ref Tensor objects. */
+class NEAccessor : public IAccessor
+{
+public:
+ /** Create an accessor for the given @p tensor.
+ *
+ * @param[in, out] tensor To be accessed tensor.
+ */
+ NEAccessor(Tensor &tensor);
+
+ NEAccessor(const NEAccessor &) = delete;
+ NEAccessor &operator=(const NEAccessor &) = delete;
+ NEAccessor(NEAccessor &&) = default;
+ NEAccessor &operator=(NEAccessor &&) = default;
+
+ TensorShape shape() const override;
+ size_t element_size() const override;
+ size_t size() const override;
+ Format format() const override;
+ DataType data_type() const override;
+ int num_channels() const override;
+ int num_elements() const override;
+ int fixed_point_position() const override;
+ const void *operator()(const Coordinates &coord) const override;
+ void *operator()(const Coordinates &coord) override;
+
+private:
+ Tensor &_tensor;
+};
+
+inline NEAccessor::NEAccessor(Tensor &tensor)
+ : _tensor{ tensor }
+{
+}
+
+inline TensorShape NEAccessor::shape() const
+{
+ return _tensor.info()->tensor_shape();
+}
+
+inline size_t NEAccessor::element_size() const
+{
+ return _tensor.info()->element_size();
+}
+
+inline size_t NEAccessor::size() const
+{
+ return _tensor.info()->total_size();
+}
+
+inline Format NEAccessor::format() const
+{
+ return _tensor.info()->format();
+}
+
+inline DataType NEAccessor::data_type() const
+{
+ return _tensor.info()->data_type();
+}
+
+inline int NEAccessor::num_channels() const
+{
+ return _tensor.info()->num_channels();
+}
+
+inline int NEAccessor::num_elements() const
+{
+ return _tensor.info()->tensor_shape().total_size();
+}
+
+inline int NEAccessor::fixed_point_position() const
+{
+ return _tensor.info()->fixed_point_position();
+}
+
+inline const void *NEAccessor::operator()(const Coordinates &coord) const
+{
+ return _tensor.ptr_to_element(coord);
+}
+
+inline void *NEAccessor::operator()(const Coordinates &coord)
+{
+ return _tensor.ptr_to_element(coord);
+}
+} // namespace neon
+} // namespace test
+} // namespace arm_compute
+#endif
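
Since every accessor method simply forwards to the wrapped tensor's TensorInfo, an NEAccessor can be used to inspect and modify individual elements once the tensor is allocated. A hedged sketch (coordinates and value are illustrative):

    using namespace arm_compute;
    using namespace arm_compute::test;
    using namespace arm_compute::test::neon;

    Tensor src = create_tensor(TensorShape(8U, 8U), DataType::U8);
    src.allocator()->allocate();

    NEAccessor accessor(src);

    // operator() yields a raw pointer to the element at the given coordinates.
    auto *element = static_cast<uint8_t *>(accessor(Coordinates(3, 5)));
    *element      = 42;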
diff --git a/tests/ProgramOptions.cpp b/tests/ProgramOptions.cpp
new file mode 100644
index 0000000000..0ae92f64e7
--- /dev/null
+++ b/tests/ProgramOptions.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ProgramOptions.h"
+
+#include "TypePrinter.h"
+#include "TypeReader.h"
+
+#include "arm_compute/core/Types.h"
+
+#include <random>
+#include <sstream>
+
+namespace arm_compute
+{
+namespace test
+{
+ProgramOptions::ProgramOptions()
+{
+ boost::program_options::options_description generic("Generic options");
+ generic.add_options()("help", "Print help message")("seed", boost::program_options::value<std::random_device::result_type>(), "Seed for the tensor library");
+
+ _visible.add(generic);
+
+    _hidden.add_options()("path", boost::program_options::value<std::string>(), "Path from which to load the assets");
+
+ _positional.add("path", 1);
+}
+
+void ProgramOptions::add_options(const boost::program_options::options_description &options)
+{
+ _visible.add(options);
+}
+
+bool ProgramOptions::wants_help() const
+{
+ return (_vm.count("help") != 0);
+}
+
+std::string ProgramOptions::get_help() const
+{
+ std::stringstream help;
+ help << _visible;
+
+ return help.str();
+}
+
+void ProgramOptions::parse_commandline(int argc, char **argv)
+{
+ boost::program_options::options_description all;
+ all.add(_visible).add(_hidden);
+
+ boost::program_options::store(boost::program_options::command_line_parser(argc, argv)
+ .options(all)
+ .positional(_positional)
+ .allow_unregistered()
+ .run(),
+ _vm);
+
+ if(_vm.count("help") == 0 && _vm.count("path") == 0)
+ {
+ throw boost::program_options::required_option("PATH");
+ }
+
+ boost::program_options::notify(_vm);
+}
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/ProgramOptions.h b/tests/ProgramOptions.h
new file mode 100644
index 0000000000..b61ae01b30
--- /dev/null
+++ b/tests/ProgramOptions.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_PROGRAM_OPTIONS_H__
+#define __ARM_COMPUTE_TEST_PROGRAM_OPTIONS_H__
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma GCC diagnostic ignored "-Wctor-dtor-privacy"
+#include "boost/program_options.hpp"
+#pragma GCC diagnostic pop
+
+#include <random>
+#include <sstream>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Defines the available command line arguments and allows parsing them. */
+class ProgramOptions
+{
+public:
+ /** Defines available options. */
+ ProgramOptions();
+
+    /** Signals if the --help flag has been passed on the command line. */
+ bool wants_help() const;
+
+ /** Returns a string describing all available options. */
+ std::string get_help() const;
+
+ /** Parses the given arguments and makes them available via @ref get.
+ *
+ * @param[in] argc Number of command line arguments.
+ * @param[in] argv Pointer to the command line arguments.
+ */
+ void parse_commandline(int argc, char **argv);
+
+ /** Sets @p value if it has been specified on the command line.
+ *
+ * @note The type T has to match the type that has been specified for the
+ * command line option.
+ *
+ * @param[in] name Name of the option to query.
+ * @param[out] value Variable to which the value will be assigned.
+ *
+ * @return True if the value is assigned, false otherwise.
+ */
+ template <typename T>
+ bool get(const std::string &name, T &value) const;
+
+protected:
+    /** Allows subclasses to add more specific options.
+ *
+ * @param[in] options Boost object containing options and their descriptions
+ */
+ void add_options(const boost::program_options::options_description &options);
+
+private:
+ boost::program_options::options_description _hidden{};
+ boost::program_options::options_description _visible{ "Configuration options" };
+ boost::program_options::positional_options_description _positional{};
+ boost::program_options::variables_map _vm{};
+};
+
+template <typename T>
+bool ProgramOptions::get(const std::string &name, T &value) const
+{
+ if(_vm.count(name) != 0)
+ {
+ value = _vm[name].as<T>();
+ return true;
+ }
+
+ return false;
+}
+} // namespace test
+} // namespace arm_compute
+#endif
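
A minimal sketch of how a test runner might drive this class, assuming a main() of our own (the option names are the ones registered in the constructor above):

    #include "ProgramOptions.h"

    #include <iostream>
    #include <random>

    int main(int argc, char **argv)
    {
        arm_compute::test::ProgramOptions options;

        // Throws unless either --help or the positional path argument is given.
        options.parse_commandline(argc, argv);

        if(options.wants_help())
        {
            std::cout << options.get_help() << "\n";
            return 0;
        }

        std::random_device::result_type seed = 0;
        if(options.get("seed", seed))
        {
            // Seed was specified on the command line; use it for the tensor library.
        }

        return 0;
    }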
diff --git a/tests/RawTensor.cpp b/tests/RawTensor.cpp
new file mode 100644
index 0000000000..6bfdf57b36
--- /dev/null
+++ b/tests/RawTensor.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "RawTensor.h"
+
+#include "Utils.h"
+
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <stdexcept>
+#include <utility>
+
+namespace arm_compute
+{
+namespace test
+{
+RawTensor::RawTensor(TensorShape shape, Format format, int fixed_point_position)
+ : _buffer(nullptr),
+ _shape(shape),
+ _format(format),
+ _fixed_point_position(fixed_point_position)
+{
+ _buffer = ::arm_compute::test::cpp14::make_unique<BufferType[]>(size());
+}
+
+RawTensor::RawTensor(TensorShape shape, DataType data_type, int num_channels, int fixed_point_position)
+ : _buffer(nullptr),
+ _shape(shape),
+ _data_type(data_type),
+ _num_channels(num_channels),
+ _fixed_point_position(fixed_point_position)
+{
+ _buffer = ::arm_compute::test::cpp14::make_unique<BufferType[]>(size());
+}
+
+RawTensor::RawTensor(const RawTensor &tensor)
+    : _buffer(nullptr),
+      _shape(tensor.shape()),
+      _format(tensor.format()),
+      _data_type(tensor.data_type()),
+      _num_channels(tensor.num_channels()),
+      _fixed_point_position(tensor.fixed_point_position())
+{
+ _buffer = ::arm_compute::test::cpp14::make_unique<BufferType[]>(tensor.size());
+ std::copy(tensor.data(), tensor.data() + size(), _buffer.get());
+}
+
+RawTensor &RawTensor::operator=(RawTensor tensor)
+{
+ swap(*this, tensor);
+
+ return *this;
+}
+
+RawTensor::BufferType &RawTensor::operator[](size_t offset)
+{
+ return _buffer[offset];
+}
+
+const RawTensor::BufferType &RawTensor::operator[](size_t offset) const
+{
+ return _buffer[offset];
+}
+
+TensorShape RawTensor::shape() const
+{
+ return _shape;
+}
+
+size_t RawTensor::element_size() const
+{
+ return num_channels() * element_size_from_data_type(data_type());
+}
+
+int RawTensor::fixed_point_position() const
+{
+ return _fixed_point_position;
+}
+
+size_t RawTensor::size() const
+{
+ const size_t size = std::accumulate(_shape.cbegin(), _shape.cend(), 1, std::multiplies<size_t>());
+ return size * element_size();
+}
+
+Format RawTensor::format() const
+{
+ return _format;
+}
+
+DataType RawTensor::data_type() const
+{
+ if(_format != Format::UNKNOWN)
+ {
+ return data_type_from_format(_format);
+ }
+ else
+ {
+ return _data_type;
+ }
+}
+
+int RawTensor::num_channels() const
+{
+ switch(_format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::U32:
+ return 1;
+ case Format::RGB888:
+ return 3;
+ case Format::UNKNOWN:
+ return _num_channels;
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+}
+
+int RawTensor::num_elements() const
+{
+ return _shape.total_size();
+}
+
+const RawTensor::BufferType *RawTensor::data() const
+{
+ return _buffer.get();
+}
+
+RawTensor::BufferType *RawTensor::data()
+{
+ return _buffer.get();
+}
+
+const RawTensor::BufferType *RawTensor::operator()(const Coordinates &coord) const
+{
+ return _buffer.get() + coord2index(_shape, coord) * element_size();
+}
+
+RawTensor::BufferType *RawTensor::operator()(const Coordinates &coord)
+{
+ return _buffer.get() + coord2index(_shape, coord) * element_size();
+}
+
+void swap(RawTensor &tensor1, RawTensor &tensor2)
+{
+ // Use unqualified call to swap to enable ADL. But make std::swap available
+ // as backup.
+ using std::swap;
+ swap(tensor1._shape, tensor2._shape);
+ swap(tensor1._format, tensor2._format);
+ swap(tensor1._data_type, tensor2._data_type);
+    swap(tensor1._num_channels, tensor2._num_channels);
+    swap(tensor1._fixed_point_position, tensor2._fixed_point_position);
+    swap(tensor1._buffer, tensor2._buffer);
+}
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/RawTensor.h b/tests/RawTensor.h
new file mode 100644
index 0000000000..87ceb438e8
--- /dev/null
+++ b/tests/RawTensor.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_RAW_TENSOR_H__
+#define __ARM_COMPUTE_TEST_RAW_TENSOR_H__
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Simple tensor object that stores elements in a consecutive chunk of memory.
+ *
+ * It can be created either by loading an image from a file, which also
+ * initialises the content of the tensor, or by explicitly specifying the size.
+ * The latter leaves the content uninitialised.
+ *
+ * Furthermore, the class provides methods to convert the tensor's values into
+ * different image formats.
+ */
+class RawTensor final
+{
+public:
+ /** Create an uninitialised tensor of the given @p shape and @p format.
+ *
+ * @param[in] shape Shape of the new raw tensor.
+ * @param[in] format Format of the new raw tensor.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers
+ */
+ RawTensor(TensorShape shape, Format format, int fixed_point_position = 0);
+
+ /** Create an uninitialised tensor of the given @p shape and @p data type.
+ *
+ * @param[in] shape Shape of the new raw tensor.
+ * @param[in] data_type Data type of the new raw tensor.
+ * @param[in] num_channels (Optional) Number of channels (default = 1).
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers (default = 0).
+ */
+ RawTensor(TensorShape shape, DataType data_type, int num_channels = 1, int fixed_point_position = 0);
+
+ /** Create a deep copy of the given @p tensor.
+ *
+ * @param[in] tensor To be copied tensor.
+ */
+ RawTensor(const RawTensor &tensor);
+
+ /** Create a deep copy of the given @p tensor.
+ *
+ * @param[in] tensor To be copied tensor.
+ */
+ RawTensor &operator =(RawTensor tensor);
+ RawTensor(RawTensor &&) = default;
+ ~RawTensor() = default;
+
+ using BufferType = uint8_t;
+ using Buffer = std::unique_ptr<BufferType[]>;
+
+ /** Return value at @p offset in the buffer.
+ *
+ * @param[in] offset Offset within the buffer.
+ */
+ BufferType &operator[](size_t offset);
+
+ /** Return constant value at @p offset in the buffer.
+ *
+ * @param[in] offset Offset within the buffer.
+ */
+ const BufferType &operator[](size_t offset) const;
+
+ /** Shape of the tensor. */
+ TensorShape shape() const;
+
+ /** Size of each element in the tensor in bytes. */
+ size_t element_size() const;
+
+ /** Total size of the tensor in bytes. */
+ size_t size() const;
+
+ /** Image format of the tensor. */
+ Format format() const;
+
+ /** Data type of the tensor. */
+ DataType data_type() const;
+
+ /** Number of channels of the tensor. */
+ int num_channels() const;
+
+ /** Number of elements of the tensor. */
+ int num_elements() const;
+
+ /** The number of bits for the fractional part of the fixed point numbers. */
+ int fixed_point_position() const;
+
+ /** Constant pointer to the underlying buffer. */
+ const BufferType *data() const;
+
+ /** Pointer to the underlying buffer. */
+ BufferType *data();
+
+ /** Read only access to the specified element.
+ *
+ * @param[in] coord Coordinates of the desired element.
+ *
+ * @return A pointer to the desired element.
+ */
+ const BufferType *operator()(const Coordinates &coord) const;
+
+ /** Access to the specified element.
+ *
+ * @param[in] coord Coordinates of the desired element.
+ *
+ * @return A pointer to the desired element.
+ */
+ BufferType *operator()(const Coordinates &coord);
+
+ /** Swaps the content of the provided tensors.
+ *
+ * @param[in, out] tensor1 Tensor to be swapped.
+ * @param[in, out] tensor2 Tensor to be swapped.
+ */
+ friend void swap(RawTensor &tensor1, RawTensor &tensor2);
+
+private:
+ Buffer _buffer{ nullptr };
+ TensorShape _shape{};
+ Format _format{ Format::UNKNOWN };
+ DataType _data_type{ DataType::UNKNOWN };
+ int _num_channels{ 0 };
+ int _fixed_point_position{ 0 };
+};
+} // namespace test
+} // namespace arm_compute
+#endif
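
To make the size bookkeeping above concrete, a small sketch (the numbers follow from element_size() and size() as defined in RawTensor.cpp):

    using namespace arm_compute;
    using namespace arm_compute::test;

    // A 4x4 RGB888 image: 3 channels of 1 byte each per element.
    RawTensor rgb(TensorShape(4U, 4U), Format::RGB888);
    // rgb.num_elements() == 16, rgb.element_size() == 3, rgb.size() == 48

    // Copying performs a deep copy: both tensors own independent buffers.
    RawTensor copy = rgb;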
diff --git a/tests/SConscript b/tests/SConscript
new file mode 100644
index 0000000000..049113aba2
--- /dev/null
+++ b/tests/SConscript
@@ -0,0 +1,150 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import SCons
+import os.path
+
+Import('env')
+Import('vars')
+Import('arm_compute_a')
+Import('arm_compute_so')
+
+# vars is imported from arm_compute:
+variables = [
+ #FIXME Remove before release (And remove all references to INTERNAL_ONLY)
+ BoolVariable("internal_only", "Enable ARM internal only tests", True),
+ BoolVariable("pmu", "Enable PMU counters", False),
+ BoolVariable("validation_tests", "Build validation test programs", True),
+ BoolVariable("benchmark_tests", "Build validation test programs", True)
+]
+
+# We need a separate set of Variables for the Help message (Otherwise the global variables will get displayed twice)
+new_options = Variables('scons')
+
+for v in variables:
+ new_options.Add(v)
+ vars.Add(v)
+
+# Clone the environment to make sure we're not polluting the arm_compute one:
+common_env = env.Clone()
+vars.Update(common_env)
+
+Help(new_options.GenerateHelpText(common_env))
+
+if env['os'] in ['android', 'bare_metal']:
+ common_env.Append(LIBS = [arm_compute_a])
+ arm_compute_lib = arm_compute_a
+else:
+ common_env.Append(LIBS = ["arm_compute"])
+ arm_compute_lib = arm_compute_so
+
+if env['arch'] == 'arm64-v8.2-a' and ( common_env['validation_tests'] or common_env['benchmark_tests']):
+ print("validation_tests=1 and benchmark_tests=1 are not currently supported for arch=arm64-v8.2-a")
+ Exit(1)
+
+#FIXME Delete before release
+if common_env['internal_only']:
+ common_env.Append(CPPDEFINES=['INTERNAL_ONLY'])
+
+common_env.Append(CPPPATH = [".", "#3rdparty/include"])
+common_env.Append(LIBPATH = ["#3rdparty/%s/%s" % (env['os'], env['arch'])])
+common_env.Append(LIBPATH = ["#build/%s" % env['build_dir']])
+common_env.Append(LIBPATH = ["#build/%s/opencl-1.2-stubs" % env['build_dir']])
+common_env.Append(LIBS = ['boost_program_options'])
+common_env.Append(CXXFLAGS = ['-Wno-missing-field-initializers'])
+
+validation_env = common_env.Clone()
+benchmark_env = common_env.Clone()
+
+validation_env.Append(CPPDEFINES=['BOOST'])
+# overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden
+benchmark_env.Append(CPPFLAGS=['-Wno-overloaded-virtual'])
+
+files = Glob('*.cpp')
+
+common_objects = [ common_env.StaticObject( f ) for f in files ]
+
+validation_env.Append(LIBS = ['boost_unit_test_framework'])
+benchmark_env.Append(LIBS = ['benchmark'])
+
+files_validation = Glob('validation/*.cpp')
+files_benchmark = Glob('benchmark/*.cpp')
+
+if env['os'] == 'android' or not common_env['pmu']:
+ if env['os'] == 'android' and common_env['pmu']:
+ if env['Werror']:
+ print("pmu=1 is not supported for os=android")
+ Exit(1)
+ else:
+ print("pmu=1 is not supported for os=android")
+
+ files_benchmark = [f for f in files_benchmark if "PMU" not in os.path.basename(str(f))]
+
+# Add unit tests
+files_validation += Glob('validation/UNIT/*.cpp')
+files_validation += Glob('validation/UNIT/*/*.cpp')
+
+if env['opencl']:
+ Import('opencl')
+
+ benchmark_env.Append(CPPDEFINES=['OPENCL'])
+
+ files_validation += Glob('validation/CL/*.cpp')
+ files_validation += Glob('validation/CL/*/*.cpp')
+ files_validation += Glob('validation/system_tests/CL/*.cpp')
+ files_benchmark += Glob('benchmark/CL/*/*.cpp')
+ files_benchmark += Glob('benchmark/CL/*.cpp')
+ files_benchmark += Glob('benchmark/system_tests/CL/*.cpp')
+
+ validation_env.Append(LIBS = "OpenCL")
+ benchmark_env.Append(LIBS = "OpenCL")
+
+if env['neon']:
+ files_validation += Glob('validation/NEON/*.cpp')
+ files_validation += Glob('validation/NEON/*/*.cpp')
+ files_validation += Glob('validation/system_tests/NEON/*.cpp')
+ files_benchmark += Glob('benchmark/NEON/*/*.cpp')
+ files_benchmark += Glob('benchmark/NEON/*.cpp')
+ files_benchmark += Glob('benchmark/system_tests/NEON/*.cpp')
+
+if env['os'] == 'android':
+ validation_env.Append(LIBS = ["log"])
+ benchmark_env.Append(LIBS = ["log"])
+else:
+ benchmark_env.Append(LIBS = ["rt"])
+
+if common_env['validation_tests']:
+ arm_compute_validation = validation_env.Program('arm_compute_validation',
+ files_validation + common_objects)
+ Depends(arm_compute_validation, arm_compute_lib)
+ if env['opencl']:
+ Depends(arm_compute_validation, opencl)
+ Default(arm_compute_validation)
+ Export('arm_compute_validation')
+if common_env['benchmark_tests']:
+ arm_compute_benchmark = benchmark_env.Program('arm_compute_benchmark',
+ files_benchmark + common_objects)
+ Depends(arm_compute_benchmark, arm_compute_lib)
+ if env['opencl']:
+ Depends(arm_compute_benchmark, opencl)
+ Default(arm_compute_benchmark)
+ Export('arm_compute_benchmark')
+
diff --git a/tests/TensorCache.h b/tests/TensorCache.h
new file mode 100644
index 0000000000..b4f7b59e15
--- /dev/null
+++ b/tests/TensorCache.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TENSOR_CACHE_H__
+#define __ARM_COMPUTE_TEST_TENSOR_CACHE_H__
+
+#include "RawTensor.h"
+
+#include <map>
+#include <mutex>
+#include <utility>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Stores @ref RawTensor categorised by the image they are created from
+ * including name, format and channel.
+ */
+class TensorCache
+{
+public:
+    /** Search the cache for a tensor created from the specified image and
+     * format.
+ *
+ * @param[in] key Key to look up the tensor. Consists of image name and format.
+ *
+ * @return The cached tensor matching the image name and format if found. A
+ * nullptr otherwise.
+ */
+ RawTensor *find(std::tuple<const std::string &, Format> key);
+
+    /** Search the cache for a tensor created from the specified image,
+     * format and channel.
+ *
+ * @param[in] key Key to look up the tensor. Consists of image name, format and channel.
+ *
+ * @return The cached tensor matching the image name and format if found. A
+ * nullptr otherwise.
+ */
+ RawTensor *find(std::tuple<const std::string &, Format, Channel> key);
+
+ /** Add the given tensor to the cache. Can later be found under the given
+ * image name and format.
+ *
+ * @param[in] key Key under which to store the tensor. Consists of image name and format.
+ * @param[in] raw Raw tensor to be stored.
+ *
+ * @return A reference to the cached tensor.
+ */
+ RawTensor &add(std::tuple<const std::string &, Format> key, RawTensor raw);
+
+ /** Add the given tensor to the cache. Can later be found under the given
+ * image name, format and channel.
+ *
+ * @param[in] key Key under which to store the tensor. Consists of image name, format and channel.
+ * @param[in] raw Raw tensor to be stored.
+ *
+ * @return A reference to the cached tensor.
+ */
+ RawTensor &add(std::tuple<const std::string &, Format, Channel> key, RawTensor raw);
+
+private:
+ using FormatMap = std::map<std::tuple<std::string, Format>, RawTensor>;
+ using ChannelMap = std::map<std::tuple<std::string, Format, Channel>, RawTensor>;
+
+ FormatMap _raw_tensor_cache{};
+ ChannelMap _raw_tensor_channel_cache{};
+ std::mutex _raw_tensor_cache_mutex{};
+ std::mutex _raw_tensor_channel_cache_mutex{};
+};
+
+inline RawTensor *TensorCache::find(std::tuple<const std::string &, Format> key)
+{
+ const auto it = _raw_tensor_cache.find(key);
+ return it == _raw_tensor_cache.end() ? nullptr : &it->second;
+}
+
+inline RawTensor *TensorCache::find(std::tuple<const std::string &, Format, Channel> key)
+{
+ const auto it = _raw_tensor_channel_cache.find(key);
+ return it == _raw_tensor_channel_cache.end() ? nullptr : &it->second;
+}
+
+inline RawTensor &TensorCache::add(std::tuple<const std::string &, Format> key, RawTensor raw)
+{
+    std::lock_guard<std::mutex> lock(_raw_tensor_cache_mutex);
+ return std::get<0>(_raw_tensor_cache.emplace(std::move(key), std::move(raw)))->second;
+}
+
+inline RawTensor &TensorCache::add(std::tuple<const std::string &, Format, Channel> key, RawTensor raw)
+{
+ std::lock_guard<std::mutex> lock(_raw_tensor_channel_cache_mutex);
+ return std::get<0>(_raw_tensor_channel_cache.emplace(std::move(key), std::move(raw)))->second;
+}
+} // namespace test
+} // namespace arm_compute
+#endif
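
Typical interplay between add() and find(); a hedged sketch (the image name is hypothetical):

    using namespace arm_compute;
    using namespace arm_compute::test;

    TensorCache cache;
    const std::string name("lena.ppm"); // hypothetical image name

    RawTensor raw(TensorShape(64U, 64U), Format::U8);
    RawTensor &cached = cache.add(std::make_tuple(name, Format::U8), std::move(raw));

    // A later lookup with an equivalent key returns a pointer to the cached entry.
    RawTensor *found = cache.find(std::make_tuple(name, Format::U8));
    // found == &cached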
diff --git a/tests/TensorLibrary.cpp b/tests/TensorLibrary.cpp
new file mode 100644
index 0000000000..0c85136a38
--- /dev/null
+++ b/tests/TensorLibrary.cpp
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "TensorLibrary.h"
+
+#include "TypePrinter.h"
+#include "UserConfiguration.h"
+#include "Utils.h"
+
+#include "arm_compute/core/ITensor.h"
+
+#include <cctype>
+#include <fstream>
+#include <limits>
+#include <map>
+#include <mutex>
+#include <sstream>
+#include <stdexcept>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace
+{
+void convert_rgb_to_u8(const RawTensor &src, RawTensor &dst)
+{
+ const size_t min_size = std::min(src.size(), dst.size());
+
+ for(size_t i = 0, j = 0; i < min_size; i += 3, ++j)
+ {
+ dst.data()[j] = 0.2126f * src.data()[i + 0] + 0.7152f * src.data()[i + 1] + 0.0722f * src.data()[i + 2];
+ }
+}
+
+void convert_rgb_to_u16(const RawTensor &src, RawTensor &dst)
+{
+ const size_t min_size = std::min(src.size(), dst.size());
+
+ for(size_t i = 0, j = 0; i < min_size; i += 3, ++j)
+ {
+ reinterpret_cast<uint16_t *>(dst.data())[j] = 0.2126f * src.data()[i + 0] + 0.7152f * src.data()[i + 1] + 0.0722f * src.data()[i + 2];
+ }
+}
+
+void convert_rgb_to_s16(const RawTensor &src, RawTensor &dst)
+{
+ const size_t min_size = std::min(src.size(), dst.size());
+
+ for(size_t i = 0, j = 0; i < min_size; i += 3, ++j)
+ {
+ reinterpret_cast<int16_t *>(dst.data())[j] = 0.2126f * src.data()[i + 0] + 0.7152f * src.data()[i + 1] + 0.0722f * src.data()[i + 2];
+ }
+}
+
+void extract_r_from_rgb(const RawTensor &src, RawTensor &dst)
+{
+ const size_t min_size = std::min(src.size(), dst.size());
+
+ for(size_t i = 0, j = 0; i < min_size; i += 3, ++j)
+ {
+ dst.data()[j] = src.data()[i];
+ }
+}
+
+void extract_g_from_rgb(const RawTensor &src, RawTensor &dst)
+{
+ const size_t min_size = std::min(src.size(), dst.size());
+
+ for(size_t i = 1, j = 0; i < min_size; i += 3, ++j)
+ {
+ dst.data()[j] = src.data()[i];
+ }
+}
+
+void discard_comments(std::ifstream &fs)
+{
+ while(fs.peek() == '#')
+ {
+ fs.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+ }
+}
+
+void discard_comments_and_spaces(std::ifstream &fs)
+{
+ while(true)
+ {
+ discard_comments(fs);
+
+ if(isspace(fs.peek()) == 0)
+ {
+ break;
+ }
+
+ fs.ignore(1);
+ }
+}
+
+std::tuple<unsigned int, unsigned int, int> parse_ppm_header(std::ifstream &fs)
+{
+ // Check the PPM magic number is valid
+ std::array<char, 2> magic_number{ { 0 } };
+ fs >> magic_number[0] >> magic_number[1];
+
+ if(magic_number[0] != 'P' || magic_number[1] != '6')
+ {
+ throw std::runtime_error("Only raw PPM format is suported");
+ }
+
+ discard_comments_and_spaces(fs);
+
+ unsigned int width = 0;
+ fs >> width;
+
+ discard_comments_and_spaces(fs);
+
+ unsigned int height = 0;
+ fs >> height;
+
+ discard_comments_and_spaces(fs);
+
+ int max_value = 0;
+ fs >> max_value;
+
+ if(!fs.good())
+ {
+ throw std::runtime_error("Cannot read image dimensions");
+ }
+
+ if(max_value != 255)
+ {
+ throw std::runtime_error("RawTensor doesn't have 8-bit values");
+ }
+
+ discard_comments(fs);
+
+ if(isspace(fs.peek()) == 0)
+ {
+ throw std::runtime_error("Invalid PPM header");
+ }
+
+ fs.ignore(1);
+
+ return std::make_tuple(width, height, max_value);
+}
+
+RawTensor load_ppm(const std::string &path)
+{
+ std::ifstream file(path, std::ios::in | std::ios::binary);
+
+ if(!file.good())
+ {
+ throw std::runtime_error("Could not load PPM image: " + path);
+ }
+
+ unsigned int width = 0;
+ unsigned int height = 0;
+
+ std::tie(width, height, std::ignore) = parse_ppm_header(file);
+
+ RawTensor raw(TensorShape(width, height), Format::RGB888);
+
+ // Check if the file is large enough to fill the image
+ const size_t current_position = file.tellg();
+ file.seekg(0, std::ios_base::end);
+ const size_t end_position = file.tellg();
+ file.seekg(current_position, std::ios_base::beg);
+
+ if((end_position - current_position) < raw.size())
+ {
+ throw std::runtime_error("Not enough data in file");
+ }
+
+ file.read(reinterpret_cast<std::fstream::char_type *>(raw.data()), raw.size());
+
+ if(!file.good())
+ {
+ throw std::runtime_error("Failure while reading image buffer");
+ }
+
+ return raw;
+}
+} // namespace
+
+TensorLibrary::TensorLibrary(std::string path)
+ : _library_path(std::move(path)), _seed{ std::random_device()() }
+{
+}
+
+TensorLibrary::TensorLibrary(std::string path, std::random_device::result_type seed)
+ : _library_path(std::move(path)), _seed{ seed }
+{
+}
+
+std::random_device::result_type TensorLibrary::seed() const
+{
+ return _seed;
+}
+
+void TensorLibrary::fill(RawTensor &raw, const std::string &name, Format format) const
+{
+ //FIXME: Should be done by swapping cached buffers
+ const RawTensor &src = get(name, format);
+ std::copy_n(src.data(), raw.size(), raw.data());
+}
+
+void TensorLibrary::fill(RawTensor &raw, const std::string &name, Channel channel) const
+{
+ fill(raw, name, get_format_for_channel(channel), channel);
+}
+
+void TensorLibrary::fill(RawTensor &raw, const std::string &name, Format format, Channel channel) const
+{
+ const RawTensor &src = get(name, format, channel);
+ std::copy_n(src.data(), raw.size(), raw.data());
+}
+
+const TensorLibrary::Loader &TensorLibrary::get_loader(const std::string &extension) const
+{
+ static std::unordered_map<std::string, Loader> loaders =
+ {
+ { "ppm", load_ppm }
+ };
+
+ const auto it = loaders.find(extension);
+
+ if(it != loaders.end())
+ {
+ return it->second;
+ }
+ else
+ {
+ throw std::invalid_argument("Cannot load image with extension '" + extension + "'");
+ }
+}
+
+const TensorLibrary::Converter &TensorLibrary::get_converter(Format src, Format dst) const
+{
+ static std::map<std::pair<Format, Format>, Converter> converters =
+ {
+ { std::make_pair(Format::RGB888, Format::U8), convert_rgb_to_u8 },
+ { std::make_pair(Format::RGB888, Format::U16), convert_rgb_to_u16 },
+ { std::make_pair(Format::RGB888, Format::S16), convert_rgb_to_s16 }
+ };
+
+ const auto it = converters.find(std::make_pair(src, dst));
+
+ if(it != converters.end())
+ {
+ return it->second;
+ }
+ else
+ {
+ std::stringstream msg;
+ msg << "Cannot convert from format '" << src << "' to format '" << dst << "'\n";
+ throw std::invalid_argument(msg.str());
+ }
+}
+
+const TensorLibrary::Converter &TensorLibrary::get_converter(DataType src, Format dst) const
+{
+ static std::map<std::pair<DataType, Format>, Converter> converters = {};
+
+ const auto it = converters.find(std::make_pair(src, dst));
+
+ if(it != converters.end())
+ {
+ return it->second;
+ }
+ else
+ {
+ std::stringstream msg;
+ msg << "Cannot convert from data type '" << src << "' to format '" << dst << "'\n";
+ throw std::invalid_argument(msg.str());
+ }
+}
+
+const TensorLibrary::Converter &TensorLibrary::get_converter(DataType src, DataType dst) const
+{
+ static std::map<std::pair<DataType, DataType>, Converter> converters = {};
+
+ const auto it = converters.find(std::make_pair(src, dst));
+
+ if(it != converters.end())
+ {
+ return it->second;
+ }
+ else
+ {
+ std::stringstream msg;
+ msg << "Cannot convert from data type '" << src << "' to data type '" << dst << "'\n";
+ throw std::invalid_argument(msg.str());
+ }
+}
+
+const TensorLibrary::Converter &TensorLibrary::get_converter(Format src, DataType dst) const
+{
+ static std::map<std::pair<Format, DataType>, Converter> converters = {};
+
+ const auto it = converters.find(std::make_pair(src, dst));
+
+ if(it != converters.end())
+ {
+ return it->second;
+ }
+ else
+ {
+ std::stringstream msg;
+ msg << "Cannot convert from format '" << src << "' to data type '" << dst << "'\n";
+ throw std::invalid_argument(msg.str());
+ }
+}
+
+const TensorLibrary::Extractor &TensorLibrary::get_extractor(Format format, Channel channel) const
+{
+ static std::map<std::pair<Format, Channel>, Extractor> extractors =
+ {
+ { std::make_pair(Format::RGB888, Channel::R), extract_r_from_rgb },
+ { std::make_pair(Format::RGB888, Channel::G), extract_g_from_rgb }
+ };
+
+ const auto it = extractors.find(std::make_pair(format, channel));
+
+ if(it != extractors.end())
+ {
+ return it->second;
+ }
+ else
+ {
+ std::stringstream msg;
+ msg << "Cannot extract channel '" << channel << "' from format '" << format << "'\n";
+ throw std::invalid_argument(msg.str());
+ }
+}
+
+RawTensor TensorLibrary::load_image(const std::string &name) const
+{
+#ifdef _WIN32
+ const std::string image_path = ("\\images\\");
+#else
+ const std::string image_path = ("/images/");
+#endif
+
+ const std::string path = _library_path + image_path + name;
+ const std::string extension = path.substr(path.find_last_of('.') + 1);
+ return (*get_loader(extension))(path);
+}
+
+const RawTensor &TensorLibrary::find_or_create_raw_tensor(const std::string &name, Format format) const
+{
+ std::lock_guard<std::mutex> guard(_format_lock);
+
+ const RawTensor *ptr = _cache.find(std::make_tuple(name, format));
+
+ if(ptr != nullptr)
+ {
+ return *ptr;
+ }
+
+ RawTensor raw = load_image(name);
+
+ if(raw.format() != format)
+ {
+ //FIXME: Remove unnecessary copy
+ RawTensor dst(raw.shape(), format);
+ (*get_converter(raw.format(), format))(raw, dst);
+ raw = std::move(dst);
+ }
+
+ return _cache.add(std::make_tuple(name, format), std::move(raw));
+}
+
+const RawTensor &TensorLibrary::find_or_create_raw_tensor(const std::string &name, Format format, Channel channel) const
+{
+ std::lock_guard<std::mutex> guard(_channel_lock);
+
+ const RawTensor *ptr = _cache.find(std::make_tuple(name, format, channel));
+
+ if(ptr != nullptr)
+ {
+ return *ptr;
+ }
+
+ const RawTensor &src = get(name, format);
+ //FIXME: Need to change shape to match channel
+ RawTensor dst(src.shape(), get_channel_format(channel));
+
+ (*get_extractor(format, channel))(src, dst);
+
+ return _cache.add(std::make_tuple(name, format, channel), std::move(dst));
+}
+
+RawTensor TensorLibrary::get(const TensorShape &shape, DataType data_type, int num_channels, int fixed_point_position)
+{
+ return RawTensor(shape, data_type, num_channels, fixed_point_position);
+}
+
+RawTensor TensorLibrary::get(const TensorShape &shape, Format format)
+{
+ return RawTensor(shape, format);
+}
+
+const RawTensor &TensorLibrary::get(const std::string &name) const
+{
+ //FIXME: Format should be derived from the image name. Not be fixed to RGB.
+ return find_or_create_raw_tensor(name, Format::RGB888);
+}
+
+RawTensor TensorLibrary::get(const std::string &name)
+{
+ //FIXME: Format should be derived from the image name. Not be fixed to RGB.
+ return RawTensor(find_or_create_raw_tensor(name, Format::RGB888));
+}
+
+RawTensor TensorLibrary::get(const std::string &name, DataType data_type, int num_channels) const
+{
+ const RawTensor &raw = get(name);
+
+ return RawTensor(raw.shape(), data_type, num_channels);
+}
+
+const RawTensor &TensorLibrary::get(const std::string &name, Format format) const
+{
+ return find_or_create_raw_tensor(name, format);
+}
+
+RawTensor TensorLibrary::get(const std::string &name, Format format)
+{
+ return RawTensor(find_or_create_raw_tensor(name, format));
+}
+
+const RawTensor &TensorLibrary::get(const std::string &name, Channel channel) const
+{
+ return get(name, get_format_for_channel(channel), channel);
+}
+
+RawTensor TensorLibrary::get(const std::string &name, Channel channel)
+{
+ return RawTensor(get(name, get_format_for_channel(channel), channel));
+}
+
+const RawTensor &TensorLibrary::get(const std::string &name, Format format, Channel channel) const
+{
+ return find_or_create_raw_tensor(name, format, channel);
+}
+
+RawTensor TensorLibrary::get(const std::string &name, Format format, Channel channel)
+{
+ return RawTensor(find_or_create_raw_tensor(name, format, channel));
+}
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/TensorLibrary.h b/tests/TensorLibrary.h
new file mode 100644
index 0000000000..b3974836ae
--- /dev/null
+++ b/tests/TensorLibrary.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TENSOR_LIBRARY_H__
+#define __ARM_COMPUTE_TEST_TENSOR_LIBRARY_H__
+
+#include "RawTensor.h"
+#include "TensorCache.h"
+#include "Utils.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <fstream>
+#include <random>
+#include <string>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Factory class to create and fill tensors.
+ *
+ * Allows initialising tensors from loaded images or by specifying the shape
+ * explicitly. Furthermore, provides methods to fill tensors with the content of
+ * loaded images or with random values.
+ */
+class TensorLibrary final
+{
+public:
+ /** Initialises the library with a @p path to the image directory.
+ *
+ * @param[in] path Path to load images from.
+ */
+ TensorLibrary(std::string path);
+
+ /** Initialises the library with a @p path to the image directory.
+ * Furthermore, sets the seed for the random generator to @p seed.
+ *
+ * @param[in] path Path to load images from.
+ * @param[in] seed Seed used to initialise the random number generator.
+ */
+ TensorLibrary(std::string path, std::random_device::result_type seed);
+
+ /** Seed that is used to fill tensors with random values. */
+ std::random_device::result_type seed() const;
+
+ /** Creates an uninitialised raw tensor with the given @p shape, @p
+ * data_type and @p num_channels.
+ *
+ * @param[in] shape Shape used to initialise the tensor.
+ * @param[in] data_type Data type used to initialise the tensor.
+ * @param[in] num_channels (Optional) Number of channels used to initialise the tensor.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers
+ */
+ static RawTensor get(const TensorShape &shape, DataType data_type, int num_channels = 1, int fixed_point_position = 0);
+
+ /** Creates an uninitialised raw tensor with the given @p shape and @p format.
+ *
+ * @param[in] shape Shape used to initialise the tensor.
+ * @param[in] format Format used to initialise the tensor.
+ */
+ static RawTensor get(const TensorShape &shape, Format format);
+
+    /** Provides a constant raw tensor for the specified image.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ */
+ const RawTensor &get(const std::string &name) const;
+
+ /** Provides a raw tensor for the specified image.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ */
+ RawTensor get(const std::string &name);
+
+ /** Creates an uninitialised raw tensor with the given @p data_type and @p
+ * num_channels. The shape is derived from the specified image.
+ *
+ * @param[in] name Image file used to initialise the tensor.
+ * @param[in] data_type Data type used to initialise the tensor.
+     * @param[in] num_channels (Optional) Number of channels used to initialise the tensor.
+ */
+ RawTensor get(const std::string &name, DataType data_type, int num_channels = 1) const;
+
+    /** Provides a constant raw tensor for the specified image after it has been
+ * converted to @p format.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] format Format used to look up the raw tensor.
+ */
+ const RawTensor &get(const std::string &name, Format format) const;
+
+ /** Provides a raw tensor for the specified image after it has been
+ * converted to @p format.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] format Format used to look up the raw tensor.
+ */
+ RawTensor get(const std::string &name, Format format);
+
+    /** Provides a constant raw tensor for the specified channel after it has
+     * been extracted from the given image.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] channel Channel used to look up the raw tensor.
+ *
+ * @note The channel has to be unambiguous so that the format can be
+ * inferred automatically.
+ */
+ const RawTensor &get(const std::string &name, Channel channel) const;
+
+ /** Provides a raw tensor for the specified channel after it has been
+     * extracted from the given image.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] channel Channel used to look up the raw tensor.
+ *
+ * @note The channel has to be unambiguous so that the format can be
+ * inferred automatically.
+ */
+ RawTensor get(const std::string &name, Channel channel);
+
+ /** Provides a constant raw tensor for the specified channel after it has
+     * been extracted from the given image formatted to @p format.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] format Format used to look up the raw tensor.
+ * @param[in] channel Channel used to look up the raw tensor.
+ */
+ const RawTensor &get(const std::string &name, Format format, Channel channel) const;
+
+ /** Provides a raw tensor for the specified channel after it has been
+     * extracted from the given image formatted to @p format.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] format Format used to look up the raw tensor.
+ * @param[in] channel Channel used to look up the raw tensor.
+ */
+ RawTensor get(const std::string &name, Format format, Channel channel);
+
+ /** Fills the specified @p tensor with random values drawn from @p
+ * distribution.
+ *
+ * @param[in, out] tensor To be filled tensor.
+ * @param[in] distribution Distribution used to fill the tensor.
+ * @param[in] seed_offset The offset will be added to the global seed before initialising the random generator.
+ *
+ * @note The @p distribution has to provide operator(Generator &) which
+ * will be used to draw samples.
+ */
+ template <typename T, typename D>
+ void fill(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const;
+
+ /** Fills the specified @p raw tensor with random values drawn from @p
+ * distribution.
+ *
+     * @param[in, out] raw          To be filled raw tensor.
+ * @param[in] distribution Distribution used to fill the tensor.
+ * @param[in] seed_offset The offset will be added to the global seed before initialising the random generator.
+ *
+ * @note The @p distribution has to provide operator(Generator &) which
+ * will be used to draw samples.
+ */
+ template <typename D>
+ void fill(RawTensor &raw, D &&distribution, std::random_device::result_type seed_offset) const;
+
+ /** Fills the specified @p tensor with the content of the specified image
+ * converted to the given format.
+ *
+ * @param[in, out] tensor To be filled tensor.
+ * @param[in] name Image file used to fill the tensor.
+ * @param[in] format Format of the image used to fill the tensor.
+ *
+ * @warning No check is performed that the specified format actually
+ * matches the format of the tensor.
+ */
+ template <typename T>
+ void fill(T &&tensor, const std::string &name, Format format) const;
+
+ /** Fills the raw tensor with the content of the specified image
+ * converted to the given format.
+ *
+ * @param[in, out] raw To be filled raw tensor.
+ * @param[in] name Image file used to fill the tensor.
+ * @param[in] format Format of the image used to fill the tensor.
+ *
+ * @warning No check is performed that the specified format actually
+ * matches the format of the tensor.
+ */
+ void fill(RawTensor &raw, const std::string &name, Format format) const;
+
+ /** Fills the specified @p tensor with the content of the specified channel
+ * extracted from the given image.
+ *
+ * @param[in, out] tensor To be filled tensor.
+ * @param[in] name Image file used to fill the tensor.
+ * @param[in] channel Channel of the image used to fill the tensor.
+ *
+ * @note The channel has to be unambiguous so that the format can be
+ * inferred automatically.
+ *
+ * @warning No check is performed that the specified format actually
+ * matches the format of the tensor.
+ */
+ template <typename T>
+ void fill(T &&tensor, const std::string &name, Channel channel) const;
+
+ /** Fills the raw tensor with the content of the specified channel
+ * extracted from the given image.
+ *
+ * @param[in, out] raw To be filled raw tensor.
+ * @param[in] name Image file used to fill the tensor.
+ * @param[in] channel Channel of the image used to fill the tensor.
+ *
+ * @note The channel has to be unambiguous so that the format can be
+ * inferred automatically.
+ *
+ * @warning No check is performed that the specified format actually
+ * matches the format of the tensor.
+ */
+ void fill(RawTensor &raw, const std::string &name, Channel channel) const;
+
+ /** Fills the specified @p tensor with the content of the specified channel
+ * extracted from the given image after it has been converted to the given
+ * format.
+ *
+ * @param[in, out] tensor To be filled tensor.
+ * @param[in] name Image file used to fill the tensor.
+ * @param[in] format Format of the image used to fill the tensor.
+ * @param[in] channel Channel of the image used to fill the tensor.
+ *
+ * @warning No check is performed that the specified format actually
+ * matches the format of the tensor.
+ */
+ template <typename T>
+ void fill(T &&tensor, const std::string &name, Format format, Channel channel) const;
+
+ /** Fills the raw tensor with the content of the specified channel
+ * extracted from the given image after it has been converted to the given
+ * format.
+ *
+ * @param[in, out] raw To be filled raw tensor.
+ * @param[in] name Image file used to fill the tensor.
+ * @param[in] format Format of the image used to fill the tensor.
+ * @param[in] channel Channel of the image used to fill the tensor.
+ *
+ * @warning No check is performed that the specified format actually
+ * matches the format of the tensor.
+ */
+ void fill(RawTensor &raw, const std::string &name, Format format, Channel channel) const;
+
+    /** Fill a tensor with a uniform distribution across the range of its type
+ *
+ * @param[in, out] tensor To be filled tensor.
+ * @param[in] seed_offset The offset will be added to the global seed before initialising the random generator.
+ */
+ template <typename T>
+ void fill_tensor_uniform(T &&tensor, std::random_device::result_type seed_offset) const;
+
+    /** Fill a tensor with a uniform distribution across a specified range
+ *
+ * @param[in, out] tensor To be filled tensor.
+ * @param[in] seed_offset The offset will be added to the global seed before initialising the random generator.
+     * @param[in]      low         Lowest value in the range (inclusive).
+     * @param[in]      high        Highest value in the range (inclusive).
+ *
+ * @note @p low and @p high must be of the same type as the data type of @p tensor
+ */
+ template <typename T, typename D>
+ void fill_tensor_uniform(T &&tensor, std::random_device::result_type seed_offset, D low, D high) const;
+
+ /** Fills the specified @p tensor with data loaded from binary in specified path.
+ *
+ * @param[in, out] tensor To be filled tensor.
+ * @param[in] name Data file.
+ */
+ template <typename T>
+ void fill_layer_data(T &&tensor, std::string name) const;
+
+private:
+ // Function prototype to convert between image formats.
+ using Converter = void (*)(const RawTensor &src, RawTensor &dst);
+ // Function prototype to extract a channel from an image.
+ using Extractor = void (*)(const RawTensor &src, RawTensor &dst);
+ // Function prototype to load an image file.
+ using Loader = RawTensor (*)(const std::string &path);
+
+ const Converter &get_converter(Format src, Format dst) const;
+ const Converter &get_converter(DataType src, Format dst) const;
+ const Converter &get_converter(Format src, DataType dst) const;
+ const Converter &get_converter(DataType src, DataType dst) const;
+ const Extractor &get_extractor(Format format, Channel) const;
+ const Loader &get_loader(const std::string &extension) const;
+
+ /** Creates a raw tensor from the specified image.
+ *
+ * @param[in] name To be loaded image file.
+ *
+ * @note If use_single_image is true @p name is ignored and the user image
+ * is loaded instead.
+ */
+ RawTensor load_image(const std::string &name) const;
+
+ /** Provides a raw tensor for the specified image and format.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] format Format used to look up the raw tensor.
+ *
+     * If the tensor has already been requested before, the cached version will
+     * be returned. Otherwise the tensor will be added to the cache.
+ *
+ * @note If use_single_image is true @p name is ignored and the user image
+ * is loaded instead.
+ */
+ const RawTensor &find_or_create_raw_tensor(const std::string &name, Format format) const;
+
+ /** Provides a raw tensor for the specified image, format and channel.
+ *
+ * @param[in] name Image file used to look up the raw tensor.
+ * @param[in] format Format used to look up the raw tensor.
+ * @param[in] channel Channel used to look up the raw tensor.
+ *
+     * If the tensor has already been requested before, the cached version will
+     * be returned. Otherwise the tensor will be added to the cache.
+ *
+ * @note If use_single_image is true @p name is ignored and the user image
+ * is loaded instead.
+ */
+ const RawTensor &find_or_create_raw_tensor(const std::string &name, Format format, Channel channel) const;
+
+ mutable TensorCache _cache{};
+ mutable std::mutex _format_lock{};
+ mutable std::mutex _channel_lock{};
+ std::string _library_path;
+ std::random_device::result_type _seed;
+};
+
+template <typename T, typename D>
+void TensorLibrary::fill(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const
+{
+ Window window;
+ for(unsigned int d = 0; d < tensor.shape().num_dimensions(); ++d)
+ {
+ window.set(d, Window::Dimension(0, tensor.shape()[d], 1));
+ }
+
+ std::mt19937 gen(_seed + seed_offset);
+
+ //FIXME: Replace with normal loop
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ using ResultType = typename std::remove_reference<D>::type::result_type;
+ const ResultType value = distribution(gen);
+ void *const out_ptr = tensor(id);
+ store_value_with_data_type(out_ptr, value, tensor.data_type());
+ });
+}
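+
+// Example usage (illustrative sketch): fill a tensor accessor with a custom
+// distribution; `accessor` is assumed to be any tensor-like object exposing
+// shape(), data_type() and operator()(const Coordinates &):
+//
+//   std::normal_distribution<float> distribution(0.f, 0.5f);
+//   library->fill(accessor, distribution, 42 /* seed offset */);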
+
+template <typename D>
+void TensorLibrary::fill(RawTensor &raw, D &&distribution, std::random_device::result_type seed_offset) const
+{
+ std::mt19937 gen(_seed + seed_offset);
+
+ for(size_t offset = 0; offset < raw.size(); offset += raw.element_size())
+ {
+ using ResultType = typename std::remove_reference<D>::type::result_type;
+ const ResultType value = distribution(gen);
+ store_value_with_data_type(raw.data() + offset, value, raw.data_type());
+ }
+}
+
+template <typename T>
+void TensorLibrary::fill(T &&tensor, const std::string &name, Format format) const
+{
+ const RawTensor &raw = get(name, format);
+
+ for(size_t offset = 0; offset < raw.size(); offset += raw.element_size())
+ {
+ const Coordinates id = index2coord(raw.shape(), offset / raw.element_size());
+
+ const RawTensor::BufferType *const raw_ptr = raw.data() + offset;
+ const auto out_ptr = static_cast<RawTensor::BufferType *>(tensor(id));
+ std::copy_n(raw_ptr, raw.element_size(), out_ptr);
+ }
+}
+
+template <typename T>
+void TensorLibrary::fill(T &&tensor, const std::string &name, Channel channel) const
+{
+ fill(std::forward<T>(tensor), name, get_format_for_channel(channel), channel);
+}
+
+template <typename T>
+void TensorLibrary::fill(T &&tensor, const std::string &name, Format format, Channel channel) const
+{
+ const RawTensor &raw = get(name, format, channel);
+
+ for(size_t offset = 0; offset < raw.size(); offset += raw.element_size())
+ {
+ const Coordinates id = index2coord(raw.shape(), offset / raw.element_size());
+
+ const RawTensor::BufferType *const raw_ptr = raw.data() + offset;
+ const auto out_ptr = static_cast<RawTensor::BufferType *>(tensor(id));
+ std::copy_n(raw_ptr, raw.element_size(), out_ptr);
+ }
+}
+
+template <typename T>
+void TensorLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_type seed_offset) const
+{
+ switch(tensor.data_type())
+ {
+ case DataType::U8:
+ {
+ std::uniform_int_distribution<uint8_t> distribution_u8(std::numeric_limits<uint8_t>::lowest(), std::numeric_limits<uint8_t>::max());
+ fill(tensor, distribution_u8, seed_offset);
+ break;
+ }
+ case DataType::S8:
+ case DataType::QS8:
+ {
+ std::uniform_int_distribution<int8_t> distribution_s8(std::numeric_limits<int8_t>::lowest(), std::numeric_limits<int8_t>::max());
+ fill(tensor, distribution_s8, seed_offset);
+ break;
+ }
+ case DataType::U16:
+ {
+ std::uniform_int_distribution<uint16_t> distribution_u16(std::numeric_limits<uint16_t>::lowest(), std::numeric_limits<uint16_t>::max());
+ fill(tensor, distribution_u16, seed_offset);
+ break;
+ }
+ case DataType::S16:
+ {
+ std::uniform_int_distribution<int16_t> distribution_s16(std::numeric_limits<int16_t>::lowest(), std::numeric_limits<int16_t>::max());
+ fill(tensor, distribution_s16, seed_offset);
+ break;
+ }
+ case DataType::U32:
+ {
+ std::uniform_int_distribution<uint32_t> distribution_u32(std::numeric_limits<uint32_t>::lowest(), std::numeric_limits<uint32_t>::max());
+ fill(tensor, distribution_u32, seed_offset);
+ break;
+ }
+ case DataType::S32:
+ {
+ std::uniform_int_distribution<int32_t> distribution_s32(std::numeric_limits<int32_t>::lowest(), std::numeric_limits<int32_t>::max());
+ fill(tensor, distribution_s32, seed_offset);
+ break;
+ }
+ case DataType::U64:
+ {
+ std::uniform_int_distribution<uint64_t> distribution_u64(std::numeric_limits<uint64_t>::lowest(), std::numeric_limits<uint64_t>::max());
+ fill(tensor, distribution_u64, seed_offset);
+ break;
+ }
+ case DataType::S64:
+ {
+ std::uniform_int_distribution<int64_t> distribution_s64(std::numeric_limits<int64_t>::lowest(), std::numeric_limits<int64_t>::max());
+ fill(tensor, distribution_s64, seed_offset);
+ break;
+ }
+#ifdef ENABLE_FP16
+ case DataType::F16:
+ {
+ std::uniform_real_distribution<float16_t> distribution_f16(std::numeric_limits<float16_t>::lowest(), std::numeric_limits<float16_t>::max());
+ fill(tensor, distribution_f16, seed_offset);
+ break;
+ }
+#endif
+ case DataType::F32:
+ {
+            // It doesn't make sense to check [-inf, inf], so hard-code it to a bounded range
+ std::uniform_real_distribution<float> distribution_f32(-1000.f, 1000.f);
+ fill(tensor, distribution_f32, seed_offset);
+ break;
+ }
+ case DataType::F64:
+ {
+            // It doesn't make sense to check [-inf, inf], so hard-code it to a bounded range
+            std::uniform_real_distribution<double> distribution_f64(-1000.0, 1000.0);
+ fill(tensor, distribution_f64, seed_offset);
+ break;
+ }
+ case DataType::SIZET:
+ {
+ std::uniform_int_distribution<size_t> distribution_sizet(std::numeric_limits<size_t>::lowest(), std::numeric_limits<size_t>::max());
+ fill(tensor, distribution_sizet, seed_offset);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+}
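+
+// Example usage (sketch): draw values across the full range of the tensor's
+// data type; for a U8 tensor this samples uniformly from [0, 255]:
+//
+//   library->fill_tensor_uniform(accessor, 0 /* seed offset */);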
+
+template <typename T, typename D>
+void TensorLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_type seed_offset, D low, D high) const
+{
+ switch(tensor.data_type())
+ {
+ case DataType::U8:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<uint8_t, D>::value));
+ std::uniform_int_distribution<uint8_t> distribution_u8(low, high);
+ fill(tensor, distribution_u8, seed_offset);
+ break;
+ }
+ case DataType::S8:
+ case DataType::QS8:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<int8_t, D>::value));
+ std::uniform_int_distribution<int8_t> distribution_s8(low, high);
+ fill(tensor, distribution_s8, seed_offset);
+ break;
+ }
+ case DataType::U16:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<uint16_t, D>::value));
+ std::uniform_int_distribution<uint16_t> distribution_u16(low, high);
+ fill(tensor, distribution_u16, seed_offset);
+ break;
+ }
+ case DataType::S16:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<int16_t, D>::value));
+ std::uniform_int_distribution<int16_t> distribution_s16(low, high);
+ fill(tensor, distribution_s16, seed_offset);
+ break;
+ }
+ case DataType::U32:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<uint32_t, D>::value));
+ std::uniform_int_distribution<uint32_t> distribution_u32(low, high);
+ fill(tensor, distribution_u32, seed_offset);
+ break;
+ }
+ case DataType::S32:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<int32_t, D>::value));
+ std::uniform_int_distribution<int32_t> distribution_s32(low, high);
+ fill(tensor, distribution_s32, seed_offset);
+ break;
+ }
+ case DataType::U64:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<uint64_t, D>::value));
+ std::uniform_int_distribution<uint64_t> distribution_u64(low, high);
+ fill(tensor, distribution_u64, seed_offset);
+ break;
+ }
+ case DataType::S64:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<int64_t, D>::value));
+ std::uniform_int_distribution<int64_t> distribution_s64(low, high);
+ fill(tensor, distribution_s64, seed_offset);
+ break;
+ }
+#ifdef ENABLE_FP16
+ case DataType::F16:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<float16_t, D>::value));
+ std::uniform_real_distribution<float16_t> distribution_f16(low, high);
+ fill(tensor, distribution_f16, seed_offset);
+ break;
+ }
+#endif
+ case DataType::F32:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<float, D>::value));
+ std::uniform_real_distribution<float> distribution_f32(low, high);
+ fill(tensor, distribution_f32, seed_offset);
+ break;
+ }
+ case DataType::F64:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<double, D>::value));
+ std::uniform_real_distribution<double> distribution_f64(low, high);
+ fill(tensor, distribution_f64, seed_offset);
+ break;
+ }
+ case DataType::SIZET:
+ {
+ ARM_COMPUTE_ERROR_ON(!(std::is_same<size_t, D>::value));
+ std::uniform_int_distribution<size_t> distribution_sizet(low, high);
+ fill(tensor, distribution_sizet, seed_offset);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+}
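+
+// Example usage (sketch): the bounds must match the tensor's data type, so an
+// S16 tensor takes int16_t bounds:
+//
+//   library->fill_tensor_uniform(accessor, 0, int16_t(-100), int16_t(100));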
+
+template <typename T>
+void TensorLibrary::fill_layer_data(T &&tensor, std::string name) const
+{
+#ifdef _WIN32
+ const std::string path_separator("\\");
+#else
+ const std::string path_separator("/");
+#endif
+
+ const std::string path = _library_path + path_separator + name;
+
+ // Open file
+ std::ifstream file(path, std::ios::in | std::ios::binary);
+ if(!file.good())
+ {
+ throw std::runtime_error("Could not load binary data: " + path);
+ }
+
+ Window window;
+ for(unsigned int d = 0; d < tensor.shape().num_dimensions(); ++d)
+ {
+ window.set(d, Window::Dimension(0, tensor.shape()[d], 1));
+ }
+
+    //FIXME: Replace with normal loop
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ float val;
+ file.read(reinterpret_cast<char *>(&val), sizeof(float));
+ void *const out_ptr = tensor(id);
+ store_value_with_data_type(out_ptr, val, tensor.data_type());
+ });
+}
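+
+// Example usage (sketch): load raw float data from a binary file relative to
+// the library path (the file name below is hypothetical):
+//
+//   library->fill_layer_data(accessor, "fc1_weights.bin");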
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/TypePrinter.h b/tests/TypePrinter.h
new file mode 100644
index 0000000000..3d5a19981f
--- /dev/null
+++ b/tests/TypePrinter.h
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TYPE_PRINTER_H__
+#define __ARM_COMPUTE_TEST_TYPE_PRINTER_H__
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <ostream>
+
+namespace arm_compute
+{
+/** Formatted output of the Dimensions type. */
+template <typename T>
+inline ::std::ostream &operator<<(::std::ostream &os, const Dimensions<T> &dimensions)
+{
+ os << "(";
+
+ if(dimensions.num_dimensions() > 0)
+ {
+ os << dimensions[0];
+
+ for(unsigned int d = 1; d < dimensions.num_dimensions(); ++d)
+ {
+ os << ", " << dimensions[d];
+ }
+ }
+
+ os << ")";
+
+ return os;
+}
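+
+// Example (sketch): streaming a TensorShape, which derives from Dimensions,
+// prints each dimension in order:
+//
+//   std::cout << TensorShape(2U, 3U, 4U); // prints "(2, 3, 4)"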
+
+/** Formatted output of the PadStrideInfo type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const PadStrideInfo &pad_stride_info)
+{
+ os << "(";
+ os << pad_stride_info.stride().first << ", " << pad_stride_info.stride().second;
+ os << ", ";
+ os << pad_stride_info.pad().first << ", " << pad_stride_info.pad().second;
+ os << ")";
+
+ return os;
+}
+
+/** Formatted output of the BorderMode type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const BorderMode &mode)
+{
+ switch(mode)
+ {
+ case BorderMode::UNDEFINED:
+ os << "UNDEFINED";
+ break;
+ case BorderMode::CONSTANT:
+ os << "CONSTANT";
+ break;
+ case BorderMode::REPLICATE:
+ os << "REPLICATE";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the InterpolationPolicy type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const InterpolationPolicy &policy)
+{
+ switch(policy)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ os << "NEAREST_NEIGHBOR";
+ break;
+ case InterpolationPolicy::BILINEAR:
+ os << "BILINEAR";
+ break;
+ case InterpolationPolicy::AREA:
+ os << "AREA";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the ConvertPolicy type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const ConvertPolicy &policy)
+{
+ switch(policy)
+ {
+ case ConvertPolicy::WRAP:
+ os << "WRAP";
+ break;
+ case ConvertPolicy::SATURATE:
+ os << "SATURATE";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the activation function type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const ActivationLayerInfo::ActivationFunction &act_function)
+{
+ switch(act_function)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ os << "ABS";
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ os << "BOUNDED_RELU";
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ os << "LINEAR";
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ os << "LOGISTIC";
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ os << "RELU";
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ os << "SOFT_RELU";
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ os << "SQRT";
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ os << "SQUARE";
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ os << "TANH";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the NormType type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const NormType &norm_type)
+{
+ switch(norm_type)
+ {
+ case NormType::CROSS_MAP:
+ os << "CROSS_MAP";
+ break;
+ case NormType::IN_MAP_1D:
+ os << "IN_MAP_1D";
+ break;
+ case NormType::IN_MAP_2D:
+ os << "IN_MAP_2D";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the PoolingType type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const PoolingType &pool_type)
+{
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ os << "AVG";
+ break;
+ case PoolingType::MAX:
+ os << "MAX";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the RoundingPolicy type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const RoundingPolicy &rounding_policy)
+{
+ switch(rounding_policy)
+ {
+ case RoundingPolicy::TO_ZERO:
+ os << "TO_ZERO";
+ break;
+ case RoundingPolicy::TO_NEAREST_UP:
+ os << "TO_NEAREST_UP";
+ break;
+ case RoundingPolicy::TO_NEAREST_EVEN:
+ os << "TO_NEAREST_EVEN";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the DataType type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const DataType &data_type)
+{
+ switch(data_type)
+ {
+ case DataType::UNKNOWN:
+ os << "UNKNOWN";
+ break;
+ case DataType::U8:
+ os << "U8";
+ break;
+ case DataType::QS8:
+ os << "QS8";
+ break;
+ case DataType::S8:
+ os << "S8";
+ break;
+ case DataType::U16:
+ os << "U16";
+ break;
+ case DataType::S16:
+ os << "S16";
+ break;
+ case DataType::U32:
+ os << "U32";
+ break;
+ case DataType::S32:
+ os << "S32";
+ break;
+ case DataType::U64:
+ os << "U64";
+ break;
+ case DataType::S64:
+ os << "S64";
+ break;
+ case DataType::F16:
+ os << "F16";
+ break;
+ case DataType::F32:
+ os << "F32";
+ break;
+ case DataType::F64:
+ os << "F64";
+ break;
+ case DataType::SIZET:
+ os << "SIZET";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the Format type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const Format &format)
+{
+ switch(format)
+ {
+ case Format::UNKNOWN:
+ os << "UNKNOWN";
+ break;
+ case Format::U8:
+ os << "U8";
+ break;
+ case Format::S16:
+ os << "S16";
+ break;
+ case Format::U16:
+ os << "U16";
+ break;
+ case Format::S32:
+ os << "S32";
+ break;
+ case Format::U32:
+ os << "U32";
+ break;
+ case Format::F16:
+ os << "F16";
+ break;
+ case Format::F32:
+ os << "F32";
+ break;
+ case Format::UV88:
+ os << "UV88";
+ break;
+ case Format::RGB888:
+ os << "RGB888";
+ break;
+ case Format::RGBA8888:
+ os << "RGBA8888";
+ break;
+ case Format::YUV444:
+ os << "YUV444";
+ break;
+ case Format::YUYV422:
+ os << "YUYV422";
+ break;
+ case Format::NV12:
+ os << "NV12";
+ break;
+ case Format::NV21:
+ os << "NV21";
+ break;
+ case Format::IYUV:
+ os << "IYUV";
+ break;
+ case Format::UYVY422:
+ os << "UYVY422";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the Channel type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const Channel &channel)
+{
+ switch(channel)
+ {
+ case Channel::UNKNOWN:
+ os << "UNKNOWN";
+ break;
+ case Channel::C0:
+ os << "C0";
+ break;
+ case Channel::C1:
+ os << "C1";
+ break;
+ case Channel::C2:
+ os << "C2";
+ break;
+ case Channel::C3:
+ os << "C3";
+ break;
+ case Channel::R:
+ os << "R";
+ break;
+ case Channel::G:
+ os << "G";
+ break;
+ case Channel::B:
+ os << "B";
+ break;
+ case Channel::A:
+ os << "A";
+ break;
+ case Channel::Y:
+ os << "Y";
+ break;
+ case Channel::U:
+ os << "U";
+ break;
+ case Channel::V:
+ os << "V";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+ }
+
+ return os;
+}
+
+/** Formatted output of the BorderSize type. */
+inline ::std::ostream &operator<<(::std::ostream &os, const BorderSize &border)
+{
+ os << "{" << border.top << ", "
+ << border.right << ", "
+ << border.bottom << ", "
+ << border.left << "}";
+
+ return os;
+}
+} // namespace arm_compute
+#endif
diff --git a/tests/TypeReader.h b/tests/TypeReader.h
new file mode 100644
index 0000000000..82eb9e42cf
--- /dev/null
+++ b/tests/TypeReader.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TYPE_READER_H__
+#define __ARM_COMPUTE_TEST_TYPE_READER_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <algorithm>
+#include <cctype>
+#include <istream>
+
+namespace arm_compute
+{
+/** Formatted input of the BorderMode type. */
+inline ::std::istream &operator>>(::std::istream &is, BorderMode &mode)
+{
+ std::string value;
+
+ is >> value;
+
+ std::transform(value.begin(), value.end(), value.begin(), [](unsigned char c)
+ {
+ return std::toupper(c);
+ });
+
+ if(value == "UNDEFINED")
+ {
+ mode = BorderMode::UNDEFINED;
+ }
+ else if(value == "CONSTANT")
+ {
+ mode = BorderMode::CONSTANT;
+ }
+ else if(value == "REPLICATE")
+ {
+ mode = BorderMode::REPLICATE;
+ }
+ else
+ {
+ throw std::invalid_argument("Unsupported value '" + value + "' for border mode");
+ }
+
+ return is;
+}
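+
+// Example (sketch): parsing is case-insensitive because the input is
+// upper-cased before comparison:
+//
+//   BorderMode mode{};
+//   std::istringstream("replicate") >> mode; // mode == BorderMode::REPLICATE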
+} // namespace arm_compute
+#endif
diff --git a/tests/Types.h b/tests/Types.h
new file mode 100644
index 0000000000..2cb69ff04e
--- /dev/null
+++ b/tests/Types.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TYPES_H__
+#define __ARM_COMPUTE_TEST_TYPES_H__
+namespace arm_compute
+{
+/** Fixed point operation */
+enum class FixedPointOp
+{
+ EXP, /**< Exponential */
+ LOG, /**< Logarithm */
+ INV_SQRT, /**< Inverse square root */
+ RECIPROCAL /**< Reciprocal */
+};
+} // namespace arm_compute
+#endif
diff --git a/tests/UserConfiguration.cpp b/tests/UserConfiguration.cpp
new file mode 100644
index 0000000000..a24de90468
--- /dev/null
+++ b/tests/UserConfiguration.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "UserConfiguration.h"
+
+#include "ProgramOptions.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace test
+{
+UserConfiguration::UserConfiguration(const ProgramOptions &options)
+{
+ std::random_device::result_type tmp_seed = 0;
+ if(options.get("seed", tmp_seed))
+ {
+ seed = tmp_seed;
+ }
+
+ std::string tmp_path;
+ if(options.get("path", tmp_path))
+ {
+ path = tmp_path;
+ }
+
+ unsigned int tmp_threads = 0;
+ if(options.get("threads", tmp_threads))
+ {
+ threads = tmp_threads;
+ }
+}
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/UserConfiguration.h b/tests/UserConfiguration.h
new file mode 100644
index 0000000000..dad0960bdb
--- /dev/null
+++ b/tests/UserConfiguration.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_USER_CONFIGURATION_H__
+#define __ARM_COMPUTE_TEST_USER_CONFIGURATION_H__
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include <random>
+#include <string>
+
+namespace arm_compute
+{
+namespace test
+{
+class ProgramOptions;
+
+/** Container providing easy access to runtime options provided by the user. */
+struct UserConfiguration
+{
+protected:
+ /** Wrapper around options to store if an option has been set. */
+ template <typename T>
+ class Option
+ {
+ public:
+ /** Initialise the option to its default (C++) value and mark it as 'not set'. */
+ Option();
+
+ /** Initialise the option to the given @p value and mark it as 'set'. */
+ Option(const T &value);
+
+ /** Assign the given @p value and mark it as 'set'. */
+ Option<T> &operator=(const T &value);
+
+ /** Query if the option has been set. */
+ constexpr bool is_set() const;
+
+ /** Return the underlying value as constant. */
+ T get() const;
+
+ /** Return the underlying value. */
+ T &get();
+
+ /** Implicitly return the underlying value. */
+ operator T() const;
+
+ private:
+ T _value;
+ bool _is_set;
+ };
+
+public:
+ UserConfiguration() = default;
+
+ /** Initialise the configuration according to the program options.
+ *
+ * @param[in] options Parsed command line options.
+ */
+ UserConfiguration(const ProgramOptions &options);
+
+ Option<std::string> path{};
+ Option<std::random_device::result_type> seed{};
+ Option<unsigned int> threads{};
+};
+
+template <typename T>
+UserConfiguration::Option<T>::Option()
+ : _value{}, _is_set{ false }
+{
+}
+
+template <typename T>
+UserConfiguration::Option<T>::Option(const T &value)
+ : _value{ value }, _is_set{ true }
+{
+}
+
+template <typename T>
+UserConfiguration::Option<T> &UserConfiguration::Option<T>::operator=(const T &value)
+{
+ _value = value;
+ _is_set = true;
+
+ return *this;
+}
+
+template <typename T>
+constexpr bool UserConfiguration::Option<T>::is_set() const
+{
+ return _is_set;
+}
+
+template <typename T>
+T UserConfiguration::Option<T>::get() const
+{
+ ARM_COMPUTE_ERROR_ON(!is_set());
+ return _value;
+}
+
+template <typename T>
+T &UserConfiguration::Option<T>::get()
+{
+ return _value;
+}
+
+template <typename T>
+UserConfiguration::Option<T>::operator T() const
+{
+ ARM_COMPUTE_ERROR_ON(!is_set());
+ return _value;
+}
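+
+// Example usage (sketch): every option carries an 'is set' flag, so callers
+// can fall back to a default when the user did not provide a value:
+//
+//   UserConfiguration config(options);
+//   const unsigned int num_threads = config.threads.is_set() ? config.threads.get() : 1;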
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/Utils.h b/tests/Utils.h
new file mode 100644
index 0000000000..420890442a
--- /dev/null
+++ b/tests/Utils.h
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_UTILS_H__
+#define __ARM_COMPUTE_TEST_UTILS_H__
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <cmath>
+#include <cstddef>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace cpp11
+{
+#ifdef __ANDROID__
+/** Convert integer and float values to string.
+ *
+ * @note This function implements the same behaviour as std::to_string. The
+ * latter is missing in some Android toolchains.
+ *
+ * @param[in] value Value to be converted to string.
+ *
+ * @return String representation of @p value.
+ */
+template <typename T, typename std::enable_if<std::is_arithmetic<typename std::decay<T>::type>::value, int>::type = 0>
+std::string to_string(T &&value)
+{
+ std::stringstream stream;
+ stream << std::forward<T>(value);
+ return stream.str();
+}
+
+/** Convert string values to integer.
+ *
+ * @note This function implements the same behaviour as std::stoi. The latter
+ * is missing in some Android toolchains.
+ *
+ * @param[in] str String to be converted to int.
+ *
+ * @return Integer representation of @p str.
+ */
+inline int stoi(const std::string &str)
+{
+ std::stringstream stream(str);
+ int value = 0;
+ stream >> value;
+ return value;
+}
+
+/** Convert string values to unsigned long.
+ *
+ * @note This function implements the same behaviour as std::stoul. The latter
+ * is missing in some Android toolchains.
+ *
+ * @param[in] str String to be converted to unsigned long.
+ *
+ * @return Unsigned long representation of @p str.
+ */
+inline unsigned long stoul(const std::string &str)
+{
+ std::stringstream stream(str);
+ unsigned long value = 0;
+ stream >> value;
+ return value;
+}
+
+/** Convert string values to float.
+ *
+ * @note This function implements the same behaviour as std::stof. The latter
+ * is missing in some Android toolchains.
+ *
+ * @param[in] str String to be converted to float.
+ *
+ * @return Float representation of @p str.
+ */
+inline float stof(const std::string &str)
+{
+ std::stringstream stream(str);
+ float value = 0.f;
+ stream >> value;
+ return value;
+}
+
+/** Round floating-point value with half value rounding away from zero.
+ *
+ * @note This function implements the same behaviour as std::round except that it doesn't
+ * support integral types. std::round is not in namespace std in some Android toolchains.
+ *
+ * @param[in] value floating-point value to be rounded.
+ *
+ * @return Floating-point value of rounded @p value.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T round(T value)
+{
+ return ::round(value);
+}
+
+/** Truncate floating-point value.
+ *
+ * @note This function implements the same behaviour as std::trunc except that it doesn't
+ * support integral types. std::trunc is not in namespace std in some Android toolchains.
+ *
+ * @param[in] value floating-point value to be truncated.
+ *
+ * @return Floating-point value of truncated @p value.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T trunc(T value)
+{
+ return ::trunc(value);
+}
+
+/** Composes a floating point value with the magnitude of @p x and the sign of @p y.
+ *
+ * @note This function implements the same behaviour as std::copysign except that it doesn't
+ * support integral types. std::copysign is not in namespace std in some Android toolchains.
+ *
+ * @param[in] x value that contains the magnitude to be used in constructing the result.
+ * @param[in] y value that contains the sign to be used in constructing the result.
+ *
+ * @return Floating-point value with magnitude of @p x and sign of @p y.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T copysign(T x, T y)
+{
+ return ::copysign(x, y);
+}
+#else
+/** Convert integer and float values to string.
+ *
+ * @note This function acts as a convenience wrapper around std::to_string. The
+ * latter is missing in some Android toolchains.
+ *
+ * @param[in] value Value to be converted to string.
+ *
+ * @return String representation of @p value.
+ */
+template <typename T>
+std::string to_string(T &&value)
+{
+ return ::std::to_string(std::forward<T>(value));
+}
+
+/** Convert string values to integer.
+ *
+ * @note This function acts as a convenience wrapper around std::stoi. The
+ * latter is missing in some Android toolchains.
+ *
+ * @param[in] args Arguments forwarded to std::stoi.
+ *
+ * @return Integer representation of input string.
+ */
+template <typename... Ts>
+int stoi(Ts &&... args)
+{
+ return ::std::stoi(std::forward<Ts>(args)...);
+}
+
+/** Convert string values to unsigned long.
+ *
+ * @note This function acts as a convenience wrapper around std::stoul. The
+ * latter is missing in some Android toolchains.
+ *
+ * @param[in] args Arguments forwarded to std::stoul.
+ *
+ * @return Unsigned long representation of input string.
+ */
+template <typename... Ts>
+unsigned long stoul(Ts &&... args)
+{
+ return ::std::stoul(std::forward<Ts>(args)...);
+}
+
+/** Convert string values to float.
+ *
+ * @note This function acts as a convenience wrapper around std::stof. The
+ * latter is missing in some Android toolchains.
+ *
+ * @param[in] args Arguments forwarded to std::stof.
+ *
+ * @return Float representation of input string.
+ */
+template <typename... Ts>
+float stof(Ts &&... args)
+{
+ return ::std::stof(std::forward<Ts>(args)...);
+}
+
+/** Round floating-point value with half value rounding away from zero.
+ *
+ * @note This function implements the same behaviour as std::round except that it doesn't
+ * support integral types. std::round is not in namespace std in some Android toolchains.
+ *
+ * @param[in] value floating-point value to be rounded.
+ *
+ * @return Floating-point value of rounded @p value.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T round(T value)
+{
+ return std::round(value);
+}
+
+/** Truncate floating-point value.
+ *
+ * @note This function implements the same behaviour as std::trunc except that it doesn't
+ * support integral types. std::trunc is not in namespace std in some Android toolchains.
+ *
+ * @param[in] value floating-point value to be truncated.
+ *
+ * @return Floating-point value of truncated @p value.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T trunc(T value)
+{
+ return std::trunc(value);
+}
+
+/** Composes a floating point value with the magnitude of @p x and the sign of @p y.
+ *
+ * @note This function implements the same behaviour as std::copysign except that it doesn't
+ * support integral types. std::copysign is not in namespace std in some Android toolchains.
+ *
+ * @param[in] x value that contains the magnitude to be used in constructing the result.
+ * @param[in] y value that contains the sign to be used in constructing the result.
+ *
+ * @return Floating-point value with magnitude of @p x and sign of @p y.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T copysign(T x, T y)
+{
+ return std::copysign(x, y);
+}
+#endif
+
+/** Round floating-point value with half value rounding to positive infinity.
+ *
+ * @param[in] value floating-point value to be rounded.
+ *
+ * @return Floating-point value of rounded @p value.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T round_half_up(T value)
+{
+ return std::floor(value + 0.5f);
+}
+
+/** Round floating-point value with half value rounding to nearest even.
+ *
+ * @param[in] value floating-point value to be rounded.
+ * @param[in] epsilon precision.
+ *
+ * @return Floating-point value of rounded @p value.
+ */
+template <typename T, typename = typename std::enable_if<std::is_floating_point<T>::value>::type>
+inline T round_half_even(T value, T epsilon = std::numeric_limits<T>::epsilon())
+{
+ T positive_value = std::abs(value);
+ T ipart = 0;
+ std::modf(positive_value, &ipart);
+ // If 'value' is exactly halfway between two integers
+ if(std::abs(positive_value - (ipart + 0.5f)) < epsilon)
+ {
+ // If 'ipart' is even then return 'ipart'
+ if(std::fmod(ipart, 2.f) < epsilon)
+ {
+ return cpp11::copysign(ipart, value);
+ }
+ // Else return the nearest even integer
+ return cpp11::copysign(std::ceil(ipart + 0.5f), value);
+ }
+ // Otherwise use the usual round to closest
+ return cpp11::copysign(cpp11::round(positive_value), value);
+}
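+
+// Examples (sketch) contrasting the rounding helpers on halfway values:
+//
+//   round(2.5f);           // 3.f (half away from zero)
+//   round_half_up(2.5f);   // 3.f (half towards positive infinity)
+//   round_half_even(2.5f); // 2.f (half to nearest even)
+//   round_half_even(3.5f); // 4.f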
+} // namespace cpp11
+
+namespace cpp14
+{
+/** make_unique is missing in C++11. Reimplemented here according to the
+ * standard proposal.
+ */
+template <class T>
+struct _Unique_if
+{
+ typedef std::unique_ptr<T> _Single_object;
+};
+
+template <class T>
+struct _Unique_if<T[]>
+{
+ typedef std::unique_ptr<T[]> _Unknown_bound;
+};
+
+template <class T, size_t N>
+struct _Unique_if<T[N]>
+{
+ typedef void _Known_bound;
+};
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object
+make_unique(Args &&... args)
+{
+ return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+template <class T>
+typename _Unique_if<T>::_Unknown_bound
+make_unique(size_t n)
+{
+ typedef typename std::remove_extent<T>::type U;
+ return std::unique_ptr<T>(new U[n]());
+}
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound
+make_unique(Args &&...) = delete;
+} // namespace cpp14
+
+namespace traits
+{
+// *INDENT-OFF*
+// clang-format off
+template <typename T> struct promote { };
+template <> struct promote<uint8_t> { using type = uint16_t; };
+template <> struct promote<int8_t> { using type = int16_t; };
+template <> struct promote<uint16_t> { using type = uint32_t; };
+template <> struct promote<int16_t> { using type = int32_t; };
+template <> struct promote<uint32_t> { using type = uint64_t; };
+template <> struct promote<int32_t> { using type = int64_t; };
+template <> struct promote<float> { using type = float; };
+
+template <typename T>
+using promote_t = typename promote<T>::type;
+
+template <typename T>
+using make_signed_conditional_t = typename std::conditional<std::is_integral<T>::value, std::make_signed<T>, std::common_type<T>>::type;
+// clang-format on
+// *INDENT-ON*
+} // namespace traits
+
+/** Look up the format corresponding to a channel.
+ *
+ * @param[in] channel Channel type.
+ *
+ * @return Format that contains the given channel.
+ */
+inline Format get_format_for_channel(Channel channel)
+{
+ switch(channel)
+ {
+ case Channel::R:
+ case Channel::G:
+ case Channel::B:
+ return Format::RGB888;
+ default:
+ throw std::runtime_error("Unsupported channel");
+ }
+}
+
+/** Return the format of a channel.
+ *
+ * @param[in] channel Channel type.
+ *
+ * @return Format of the given channel.
+ */
+inline Format get_channel_format(Channel channel)
+{
+ switch(channel)
+ {
+ case Channel::R:
+ case Channel::G:
+ case Channel::B:
+ return Format::U8;
+ default:
+ throw std::runtime_error("Unsupported channel");
+ }
+}
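+
+// Example (sketch): the R channel is looked up in an RGB888 image but is
+// itself stored as a single-plane U8 format:
+//
+//   get_format_for_channel(Channel::R); // Format::RGB888
+//   get_channel_format(Channel::R);     // Format::U8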
+
+/** Base case of foldl.
+ *
+ * @return value.
+ */
+template <typename F, typename T>
+inline T foldl(F &&, const T &value)
+{
+ return value;
+}
+
+/** Base case of foldl.
+ *
+ * @return func(value1, value2).
+ */
+template <typename F, typename T, typename U>
+inline auto foldl(F &&func, T &&value1, U &&value2) -> decltype(func(value1, value2))
+{
+ return func(value1, value2);
+}
+
+/** Fold left.
+ *
+ * @param[in] func Binary function to be called.
+ * @param[in] initial Initial value.
+ * @param[in] value Argument passed to the function.
+ * @param[in] values Remaining arguments.
+ */
+template <typename F, typename I, typename T, typename... Vs>
+inline I foldl(F &&func, I &&initial, T &&value, Vs &&... values)
+{
+ return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...);
+}
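+
+// Example (sketch): foldl applies the function from left to right:
+//
+//   foldl(std::plus<int>(), 0, 1, 2, 3); // ((0 + 1) + 2) + 3 = 6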
+
+/** Create a valid region covering the entire tensor shape.
+ *
+ * @param[in] shape Shape used as size of the valid region.
+ *
+ * @return A valid region starting at (0, 0, ...) with size of @p shape.
+ */
+inline ValidRegion shape_to_valid_region(TensorShape shape)
+{
+ Coordinates anchor;
+ anchor.set(std::max<int>(0, shape.num_dimensions() - 1), 0);
+ return ValidRegion(std::move(anchor), std::move(shape));
+}
+
+/** Create a valid region covering the tensor shape with UNDEFINED border mode and specified border size.
+ *
+ * @param[in] shape Shape used as size of the valid region.
+ * @param[in] border_size Border size used to specify the region to exclude.
+ *
+ * @return A valid region starting at (@p border_size.left, @p border_size.top, ...) with reduced size of @p shape.
+ */
+inline ValidRegion shape_to_valid_region_undefined_border(TensorShape shape, BorderSize border_size)
+{
+ ARM_COMPUTE_ERROR_ON(shape.num_dimensions() < 2);
+ Coordinates anchor;
+ anchor.set(std::max<int>(0, shape.num_dimensions() - 1), 0);
+ anchor.set(0, border_size.left);
+ anchor.set(1, border_size.top);
+ shape.set(0, shape.x() - border_size.left - border_size.right);
+ shape.set(1, shape.y() - border_size.top - border_size.bottom);
+ return ValidRegion(std::move(anchor), shape);
+}
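+
+// Example (sketch): a 16x16 image with a border of 1 on every side yields a
+// valid region anchored at (1, 1) with shape 14x14:
+//
+//   shape_to_valid_region_undefined_border(TensorShape(16U, 16U), BorderSize(1));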
+
+/** Calculate the required padding given the available @p size and the required
+ * @p step.
+ *
+ * @param[in] size Available size.
+ * @param[in] step Required step size.
+ *
+ * @return Difference between next greater multiple of @p step and @p size.
+ */
+inline int required_padding(int size, int step)
+{
+ return ((size + step - 1) / step) * step - size;
+}
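+
+// Example (sketch): with 7 available elements and a step of 4 the next
+// multiple of the step is 8, so one element of padding is required:
+//
+//   required_padding(7, 4); // ((7 + 4 - 1) / 4) * 4 - 7 = 1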
+
+/** Calculate the required padding for writing operation with UNDEFINED border mode.
+ *
+ * @param[in] size Available size.
+ * @param[in] step Required step size; number of elements to write at each iteration.
+ * @param[in] border_size Border size.
+ *
+ * @return Required padding size plus border size.
+ */
+inline int required_padding_undefined_border_write(int size, int step, int border_size)
+{
+ return required_padding(size, step) + border_size;
+}
+
+/** Calculate the required padding for reading operation with UNDEFINED border mode.
+ *
+ * @param[in] size Available size.
+ * @param[in] read_step Required step size; number of elements to read at each iteration.
+ * @param[in] process_step Required step size; number of elements to process at each iteration.
+ *
+ * @return Required padding size.
+ */
+inline int required_padding_undefined_border_read(int size, int read_step, int process_step)
+{
+ return required_padding(size, process_step) + read_step - process_step;
+}
+
+/** Write the value after casting the pointer according to @p data_type.
+ *
+ * @warning The type of the value must match the specified data type.
+ *
+ * @param[out] ptr Pointer to memory where the @p value will be written.
+ * @param[in] value Value that will be written.
+ * @param[in] data_type Data type that will be written.
+ */
+template <typename T>
+void store_value_with_data_type(void *ptr, T value, DataType data_type)
+{
+ switch(data_type)
+ {
+ case DataType::U8:
+ *reinterpret_cast<uint8_t *>(ptr) = value;
+ break;
+ case DataType::S8:
+ case DataType::QS8:
+ *reinterpret_cast<int8_t *>(ptr) = value;
+ break;
+ case DataType::U16:
+ *reinterpret_cast<uint16_t *>(ptr) = value;
+ break;
+ case DataType::S16:
+ *reinterpret_cast<int16_t *>(ptr) = value;
+ break;
+ case DataType::U32:
+ *reinterpret_cast<uint32_t *>(ptr) = value;
+ break;
+ case DataType::S32:
+ *reinterpret_cast<int32_t *>(ptr) = value;
+ break;
+ case DataType::U64:
+ *reinterpret_cast<uint64_t *>(ptr) = value;
+ break;
+ case DataType::S64:
+ *reinterpret_cast<int64_t *>(ptr) = value;
+ break;
+#ifdef ENABLE_FP16
+ case DataType::F16:
+ *reinterpret_cast<float16_t *>(ptr) = value;
+ break;
+#endif /* ENABLE_FP16 */
+ case DataType::F32:
+ *reinterpret_cast<float *>(ptr) = value;
+ break;
+ case DataType::F64:
+ *reinterpret_cast<double *>(ptr) = value;
+ break;
+ case DataType::SIZET:
+ *reinterpret_cast<size_t *>(ptr) = value;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+}
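+
+// Example usage (sketch): write a value into raw tensor memory at a byte
+// offset, where `raw` is assumed to be a RawTensor:
+//
+//   store_value_with_data_type(raw.data() + offset, 42, raw.data_type());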
+
+/** Saturate a value of type T against the numeric limits of type U.
+ *
+ * @param[in] val Value to be saturated.
+ *
+ * @return saturated value.
+ */
+template <typename U, typename T>
+T saturate_cast(T val)
+{
+ if(val > static_cast<T>(std::numeric_limits<U>::max()))
+ {
+ val = static_cast<T>(std::numeric_limits<U>::max());
+ }
+ if(val < static_cast<T>(std::numeric_limits<U>::lowest()))
+ {
+ val = static_cast<T>(std::numeric_limits<U>::lowest());
+ }
+ return val;
+}
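+
+// Example (sketch): clamp a value against the numeric limits of a narrower
+// type; the returned value keeps the input type:
+//
+//   saturate_cast<uint8_t>(300); // 255
+//   saturate_cast<uint8_t>(-5);  // 0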
+
+/** Find the signed promoted common type.
+ */
+template <typename... T>
+struct common_promoted_signed_type
+{
+ using common_type = typename std::common_type<T...>::type;
+ using promoted_type = traits::promote_t<common_type>;
+ using intermediate_type = typename traits::make_signed_conditional_t<promoted_type>::type;
+};
+
+/** Convert a linear index into n-dimensional coordinates.
+ *
+ * @param[in] shape Shape of the n-dimensional tensor.
+ * @param[in] index Linear index specifying the i-th element.
+ *
+ * @return n-dimensional coordinates.
+ */
+inline Coordinates index2coord(const TensorShape &shape, int index)
+{
+ int num_elements = shape.total_size();
+
+    ARM_COMPUTE_ERROR_ON_MSG(index < 0 || index >= num_elements, "Index has to be in [0, num_elements)");
+ ARM_COMPUTE_ERROR_ON_MSG(num_elements == 0, "Cannot create coordinate from empty shape");
+
+ Coordinates coord{ 0 };
+
+ for(int d = shape.num_dimensions() - 1; d >= 0; --d)
+ {
+ num_elements /= shape[d];
+ coord.set(d, index / num_elements);
+ index %= num_elements;
+ }
+
+ return coord;
+}
+
+/** Linearise the given coordinate.
+ *
+ * Transforms the given coordinate into a linear offset in terms of
+ * elements.
+ *
+ * @param[in] shape Shape of the n-dimensional tensor.
+ * @param[in] coord Coordinate to be converted.
+ *
+ * @return Linear offset to the element.
+ */
+inline int coord2index(const TensorShape &shape, const Coordinates &coord)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(shape.total_size() == 0, "Cannot get index from empty shape");
+ ARM_COMPUTE_ERROR_ON_MSG(coord.num_dimensions() == 0, "Cannot get index of empty coordinate");
+
+ int index = 0;
+ int dim_size = 1;
+
+ for(unsigned int i = 0; i < coord.num_dimensions(); ++i)
+ {
+ index += coord[i] * dim_size;
+ dim_size *= shape[i];
+ }
+
+ return index;
+}
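+
+// Example (sketch): index2coord and coord2index are inverses; for a 4x2 shape
+// (width 4, height 2) the linear index 5 maps to coordinates (1, 1):
+//
+//   const Coordinates coord = index2coord(TensorShape(4U, 2U), 5); // (1, 1)
+//   coord2index(TensorShape(4U, 2U), coord);                       // 5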
+
+/** Check if a coordinate is within a valid region */
+inline bool is_in_valid_region(const ValidRegion &valid_region, const Coordinates &coord)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(valid_region.shape.num_dimensions() != coord.num_dimensions(), "Shapes of valid region and coordinates do not agree");
+ for(int d = 0; static_cast<size_t>(d) < coord.num_dimensions(); ++d)
+ {
+ if(coord[d] < valid_region.start(d) || coord[d] >= valid_region.end(d))
+ {
+ return false;
+ }
+ }
+ return true;
+}
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/benchmark/CL/ActivationLayer.cpp b/tests/benchmark/CL/ActivationLayer.cpp
new file mode 100644
index 0000000000..5180d3d900
--- /dev/null
+++ b/tests/benchmark/CL/ActivationLayer.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/ActivationLayer.h"
+
+namespace
+{
+using ActivationLayerAlexNet = ActivationLayer<AlexNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerLeNet5 = ActivationLayer<LeNet5ActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+using ActivationLayerGoogLeNet = ActivationLayer<GoogLeNetActivationLayerDataset, CLTensor, CLAccessor, CLActivationLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ActivationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ActivationLayerDataset, 0, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 32, 1, 4, 8>);
diff --git a/tests/benchmark/CL/BitwiseAnd.cpp b/tests/benchmark/CL/BitwiseAnd.cpp
new file mode 100644
index 0000000000..a3deb3eb5b
--- /dev/null
+++ b/tests/benchmark/CL/BitwiseAnd.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+namespace
+{
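+// Fixture benchmarking CLBitwiseAnd on U8 tensors sized after images from the
+// given DataSet; the two inputs are filled from the R and G channels of the
+// same image (see SetUp below).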
+template <typename DataSet>
+class BitwiseAnd : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ ::benchmark::Fixture::SetUp(state);
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const std::string image_name = *(DataSet().begin() + state.range(0));
+
+ // Create tensors
+ src1 = create_tensor(image_name, DataType::U8);
+ src2 = create_tensor(image_name, DataType::U8);
+ dst = create_tensor(image_name, DataType::U8);
+
+ // Create and configure function
+ band.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill source tensors
+ library->fill(CLAccessor(src1), image_name, Channel::R);
+ library->fill(CLAccessor(src2), image_name, Channel::G);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+
+ ::benchmark::Fixture::TearDown(state);
+ }
+
+ CLBitwiseAnd band{};
+ Profiler profiler{};
+
+private:
+ CLTensor src1{};
+ CLTensor src2{};
+ CLTensor dst{};
+};
+
+using BitwiseAndSmall = BitwiseAnd<SmallImages>;
+using BitwiseAndLarge = BitwiseAnd<LargeImages>;
+} // namespace
+
+BENCHMARK_DEFINE_F(BitwiseAndSmall, cl_bitwise_and)
+(::benchmark::State &state)
+{
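+    // Note: the CL queue is synchronized inside the timed region below, so the
+    // measurement covers device execution rather than just kernel enqueue.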
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndSmall, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<SmallImages>);
+
+BENCHMARK_DEFINE_F(BitwiseAndLarge, cl_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndLarge, cl_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<LargeImages>);
diff --git a/tests/benchmark/CL/CMakeLists.txt b/tests/benchmark/CL/CMakeLists.txt
new file mode 100644
index 0000000000..8493309f40
--- /dev/null
+++ b/tests/benchmark/CL/CMakeLists.txt
@@ -0,0 +1,57 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+include_directories(${CMAKE_SOURCE_DIR}/../include)
+
+set(arm_compute_test_benchmark_TARGET_DEFINITIONS
+ ${arm_compute_test_benchmark_TARGET_DEFINITIONS}
+ -DOPENCL
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_TARGET_INCLUDES
+ ${arm_compute_test_benchmark_TARGET_INCLUDES}
+ ${CMAKE_SOURCE_DIR}/../include
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_OPENCL_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/CL/CLAccessor.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseAnd.cpp
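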
+)
+
+add_library(arm_compute_test_benchmark_OPENCL OBJECT
+ ${arm_compute_test_benchmark_OPENCL_SOURCE_FILES}
+)
+
+set(arm_compute_test_benchmark_TARGET_OBJECTS
+ ${arm_compute_test_benchmark_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_benchmark_OPENCL>
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_benchmark_TARGET_LIBRARIES
+ ${arm_compute_test_benchmark_TARGET_LIBRARIES}
+ OpenCL
+ PARENT_SCOPE
+)
diff --git a/tests/benchmark/CL/ConvolutionLayer.cpp b/tests/benchmark/CL/ConvolutionLayer.cpp
new file mode 100644
index 0000000000..e1f4fabdc3
--- /dev/null
+++ b/tests/benchmark/CL/ConvolutionLayer.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
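+// Note: the shared fixture header is included after the using-directives
+// above, as it appears to rely on them being in scope.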
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerAlexNet = ConvolutionLayer<AlexNetConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerLeNet5 = ConvolutionLayer<LeNet5ConvolutionLayerDataset, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet1 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset1, CLTensor, CLAccessor, CLConvolutionLayer>;
+using ConvolutionLayerGoogLeNet2 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset2, CLTensor, CLAccessor, CLConvolutionLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 16, 1, 4, 8>);
diff --git a/tests/benchmark/CL/FullyConnectedLayer.cpp b/tests/benchmark/CL/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000..6e8c89fa0b
--- /dev/null
+++ b/tests/benchmark/CL/FullyConnectedLayer.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/FullyConnectedLayer.h"
+
+namespace
+{
+using FullyConnectedLayerAlexNet = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerLeNet5 = FullyConnectedLayer<LeNet5FullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+using FullyConnectedLayerGoogLeNet = FullyConnectedLayer<GoogLeNetFullyConnectedLayerDataset, CLTensor, CLAccessor, CLFullyConnectedLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
diff --git a/tests/benchmark/CL/GEMM.cpp b/tests/benchmark/CL/GEMM.cpp
new file mode 100644
index 0000000000..b90556df48
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.cpp
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/CL/GEMM.h"
+
+namespace
+{
+using GEMMFP16GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP16GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F16>;
+using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+using GEMMFP32GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+} // namespace
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet2, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
diff --git a/tests/benchmark/CL/GEMM.h b/tests/benchmark/CL/GEMM.h
new file mode 100644
index 0000000000..02a339609c
--- /dev/null
+++ b/tests/benchmark/CL/GEMM.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+// FIXME: Merge with NEON/GEMM.h into common/GEMM.h after adding F16 support to NEON GEMM and QS8 support to CL GEMM
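+// Generic GEMM benchmark fixture: DataSet supplies the matrix shapes and the
+// alpha/beta scalars, TensorType/Accessor/Function select the backend, and
+// data_type must be F16 or F32 (checked in SetUp).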
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType data_type>
+class GEMM : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F16 && data_type != DataType::F32, "Unsupported data type for GEMM operation");
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const GEMMDataObject gemm_obj = *(DataSet().begin() + state.range(0));
+
+ TensorShape shape_a = gemm_obj.shape_a;
+ TensorShape shape_b = gemm_obj.shape_b;
+ TensorShape shape_c = gemm_obj.shape_c;
+ TensorShape shape_d = gemm_obj.shape_d;
+
+ // Create tensors
+ a = create_tensor(shape_a, data_type);
+ b = create_tensor(shape_b, data_type);
+ c = create_tensor(shape_c, data_type);
+ d = create_tensor(shape_d, data_type);
+
+ // Create and configure function
+ gemm_layer = std::unique_ptr<Function>(new Function());
+ gemm_layer->configure(&a, &b, &c, &d, gemm_obj.alpha, gemm_obj.beta);
+
+ // Allocate tensors
+ a.allocator()->allocate();
+ b.allocator()->allocate();
+ c.allocator()->allocate();
+ d.allocator()->allocate();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ gemm_layer.reset();
+
+ a.allocator()->free();
+ b.allocator()->free();
+ c.allocator()->free();
+ d.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> gemm_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType a{};
+ TensorType b{};
+ TensorType c{};
+ TensorType d{};
+};
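+// Illustrative usage (mirroring GEMM.cpp): instantiate the fixture for a
+// backend and data type, then define and register the benchmark, e.g.
+//   using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, CLTensor, CLAccessor, CLGEMM, DataType::F32>;
+//   BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, cl_googlenet)->Threads(1)->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);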
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_CL_GEMM_H__
diff --git a/tests/benchmark/CL/NormalizationLayer.cpp b/tests/benchmark/CL/NormalizationLayer.cpp
new file mode 100644
index 0000000000..81d3c65912
--- /dev/null
+++ b/tests/benchmark/CL/NormalizationLayer.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/NormalizationLayer.h"
+
+namespace
+{
+using NormalizationLayerAlexNet = NormalizationLayer<AlexNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+using NormalizationLayerGoogLeNet = NormalizationLayer<GoogLeNetNormalizationLayerDataset, CLTensor, CLAccessor, CLNormalizationLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(NormalizationLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 1, 1, 4, 8>);
diff --git a/tests/benchmark/CL/PoolingLayer.cpp b/tests/benchmark/CL/PoolingLayer.cpp
new file mode 100644
index 0000000000..5285f279e7
--- /dev/null
+++ b/tests/benchmark/CL/PoolingLayer.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/common/PoolingLayer.h"
+
+namespace
+{
+using PoolingLayerAlexNet = PoolingLayer<AlexNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerLeNet5 = PoolingLayer<LeNet5PoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+using PoolingLayerGoogLeNet = PoolingLayer<GoogLeNetPoolingLayerDataset, CLTensor, CLAccessor, CLPoolingLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(PoolingLayerAlexNet, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNet, cl_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerLeNet5, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, cl_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerGoogLeNet, cl_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+// FIXME: Add support for 7x7 pooling layer pool5/7x7_s1
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, cl_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 9, 1, 4, 8>);
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000..115333a1b0
--- /dev/null
+++ b/tests/benchmark/CMakeLists.txt
@@ -0,0 +1,100 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
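+# Google Benchmark is consumed as a prebuilt static library from 3rdparty.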
+add_library(benchmark STATIC IMPORTED)
+set_target_properties(benchmark PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../3rdparty/linux/armv7a/libbenchmark.a"
+)
+
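+# The OpenCL stub is built without an SONAME, so IMPORTED_NO_SONAME makes
+# CMake link it by path instead of recording a soname dependency.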
+add_library(OpenCL SHARED IMPORTED)
+set_target_properties(OpenCL PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../build/opencl-1.2-stubs/libOpenCL.so"
+ IMPORTED_NO_SONAME 1
+)
+
+option(ENABLE_PMU_COUNTER "Compile with PMU counter support")
+
+set(ARM_COMPUTE_TARGETS_TO_MEASURE "all" CACHE STRING "Semicolon-separated list of targets to include in the benchmark.")
+
+set(ARM_COMPUTE_ALL_TARGETS
+ NEON
+ CL
+)
+
+if(ARM_COMPUTE_TARGETS_TO_MEASURE STREQUAL "all")
+ set(ARM_COMPUTE_TARGETS_TO_MEASURE ${ARM_COMPUTE_ALL_TARGETS})
+endif()
+
+list(REMOVE_DUPLICATES ARM_COMPUTE_TARGETS_TO_MEASURE)
+
+foreach(TARGET ${ARM_COMPUTE_TARGETS_TO_MEASURE})
+ list(FIND ARM_COMPUTE_ALL_TARGETS ${TARGET} idx)
+
+ if(${idx} LESS 0)
+ message(FATAL_ERROR "The target '${TARGET}' does not exist. It should be one of\n${ARM_COMPUTE_ALL_TARGETS}")
+ else()
+ add_subdirectory(${TARGET})
+ endif()
+endforeach()
+
+set(arm_compute_test_benchmark_SOURCE_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Datasets.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Instrument.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Profiler.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Profiler.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceProgramOptions.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceProgramOptions.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceUserConfiguration.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/PerformanceUserConfiguration.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/WallClockTimer.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/WallClockTimer.cpp
+)
+
+if(${ENABLE_PMU_COUNTER})
+ list(APPEND arm_compute_test_benchmark_SOURCE_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/PMUCounter.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/PMUCounter.cpp
+ )
+endif()
+
+add_library(arm_compute_test_benchmark OBJECT
+ ${arm_compute_test_benchmark_SOURCE_FILES}
+)
+
+add_definitions(${arm_compute_test_benchmark_TARGET_DEFINITIONS})
+include_directories(${arm_compute_test_benchmark_TARGET_INCLUDES})
+
+add_executable(arm_compute_benchmark
+ $<TARGET_OBJECTS:arm_compute_test_benchmark>
+ ${arm_compute_test_benchmark_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:tensor_library>
+ $<TARGET_OBJECTS:arm_compute_test>
+)
+
+target_link_libraries(arm_compute_benchmark
+ benchmark
+ boost_program_options
+ arm_compute
+ ${arm_compute_test_benchmark_TARGET_LIBRARIES}
+)
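
Note: ENABLE_PMU_COUNTER and ARM_COMPUTE_TARGETS_TO_MEASURE above are set on the CMake command line, e.g. cmake -DENABLE_PMU_COUNTER=ON -DARM_COMPUTE_TARGETS_TO_MEASURE="NEON;CL" <path-to-tests/benchmark>. This is an illustrative invocation only; the build directory layout is not prescribed by this patch.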
diff --git a/tests/benchmark/Datasets.h b/tests/benchmark/Datasets.h
new file mode 100644
index 0000000000..e7bfb6f10f
--- /dev/null
+++ b/tests/benchmark/Datasets.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_DATASETS_H__
+
+#include "dataset/ActivationLayerDataset.h"
+#include "dataset/BorderModeDataset.h"
+#include "dataset/ConvolutionLayerDataset.h"
+#include "dataset/DataTypeDatasets.h"
+#include "dataset/FullyConnectedLayerDataset.h"
+#include "dataset/GEMMDataset.h"
+#include "dataset/ImageDatasets.h"
+#include "dataset/InterpolationPolicyDataset.h"
+#include "dataset/NormalizationLayerDataset.h"
+#include "dataset/PoolingLayerDataset.h"
+#include "dataset/ShapeDatasets.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, int N>
+void DataSetArg(::benchmark::internal::Benchmark *b)
+{
+ b->Arg(N);
+ b->ArgName(std::string(*(DataSet().begin() + N)));
+}
+
+template <typename DataSet, int N, unsigned int... Args>
+void DataSetArgBatched(::benchmark::internal::Benchmark *b)
+{
+ constexpr std::array<unsigned int, sizeof...(Args)> batches{ { Args... } };
+ for(const auto &el : batches)
+ {
+ b->Args({ N, static_cast<int>(el) });
+ }
+ b->ArgNames({ std::string(*(DataSet().begin() + N)), "batch_size" });
+}
+
+template <typename DataSet>
+void DataSetArgs(::benchmark::internal::Benchmark *b)
+{
+ for(size_t i = 0; i < DataSet().size(); ++i)
+ {
+ b->Arg(i);
+ b->ArgName(*(DataSet().begin() + i));
+ }
+}
+}
+}
+}
+#endif
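
Note: DataSetArgBatched<DataSet, N, Args...> registers one { dataset index, batch size } argument pair per batch size, which is how the registration blocks throughout this patch parameterise a single dataset entry over several batch sizes. A minimal sketch of the equivalent hand-written expansion for N = 0 and batch sizes 1, 4, 8, assuming google-benchmark's Args/ArgNames API as used above; the dataset and its "layer0" entry name are hypothetical:

#include "benchmark/benchmark_api.h"

#include <string>

// Equivalent of DataSetArgBatched<SomeDataset, 0, 1, 4, 8>, where
// entry 0 of the (hypothetical) dataset is named "layer0".
void apply_batched(::benchmark::internal::Benchmark *b)
{
    b->Args({ 0, 1 }); // { dataset index, batch size }
    b->Args({ 0, 4 });
    b->Args({ 0, 8 });
    b->ArgNames({ "layer0", "batch_size" });
}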
diff --git a/tests/benchmark/Instrument.h b/tests/benchmark/Instrument.h
new file mode 100644
index 0000000000..39b0088670
--- /dev/null
+++ b/tests/benchmark/Instrument.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_INSTRUMENT_H__
+
+#include "Utils.h"
+
+#include <memory>
+#include <string>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Interface for classes that can be used to measure performance. */
+class Instrument
+{
+public:
+ /** Interface defining a measurement, e.g. time, cycles, ... */
+ class IMeasurement
+ {
+ public:
+ IMeasurement() = default;
+ IMeasurement(const IMeasurement &) = default;
+ IMeasurement(IMeasurement &&) = default;
+ IMeasurement &operator=(const IMeasurement &) = default;
+ IMeasurement &operator=(IMeasurement &&) = default;
+ virtual ~IMeasurement() = default;
+
+ virtual operator double() const = 0;
+ };
+
+    /** Implementation of a Measurement class for arithmetic types. */
+ template <typename T>
+ class Measurement : public IMeasurement
+ {
+ public:
+ /** Store the given value as measurement.
+ *
+ * @param[in] value Measured value.
+ */
+ Measurement(T value);
+
+ operator double() const override;
+
+ private:
+ T _value;
+ };
+
+ Instrument() = default;
+ Instrument(const Instrument &) = default;
+ Instrument(Instrument &&) = default;
+ Instrument &operator=(const Instrument &) = default;
+ Instrument &operator=(Instrument &&) = default;
+ virtual ~Instrument() = default;
+
+ /** Identifier for the instrument */
+ virtual std::string id() const = 0;
+
+ /** Start measuring. */
+ virtual void start() = 0;
+
+ /** Stop measuring. */
+ virtual void stop() = 0;
+
+ /** Return the latest measurement. */
+ virtual std::unique_ptr<IMeasurement> get_measurement() const = 0;
+};
+
+template <typename T>
+Instrument::Measurement<T>::Measurement(T value)
+ : _value{ value }
+{
+}
+
+template <typename T>
+Instrument::Measurement<T>::operator double() const
+{
+ return _value;
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif
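
Note: a concrete instrument only has to implement id(), start(), stop() and get_measurement(). A minimal sketch against the interface above, using std::chrono; SketchTimer is a hypothetical name for illustration and is not the library's WallClockTimer:

#include "Instrument.h"

#include <chrono>
#include <memory>
#include <string>

namespace arm_compute
{
namespace test
{
namespace benchmark
{
// Measures wall-clock time in milliseconds between start() and stop().
class SketchTimer : public Instrument
{
public:
    std::string id() const override
    {
        return "sketch_timer";
    }
    void start() override
    {
        _begin = std::chrono::steady_clock::now();
    }
    void stop() override
    {
        _end = std::chrono::steady_clock::now();
    }
    std::unique_ptr<IMeasurement> get_measurement() const override
    {
        const double ms = std::chrono::duration<double, std::milli>(_end - _begin).count();
        return std::unique_ptr<IMeasurement>(new Measurement<double>(ms));
    }

private:
    std::chrono::steady_clock::time_point _begin{};
    std::chrono::steady_clock::time_point _end{};
};
} // namespace benchmark
} // namespace test
} // namespace arm_compute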
diff --git a/tests/benchmark/NEON/ActivationLayer.cpp b/tests/benchmark/NEON/ActivationLayer.cpp
new file mode 100644
index 0000000000..8faed9f831
--- /dev/null
+++ b/tests/benchmark/NEON/ActivationLayer.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ActivationLayer.h"
+
+namespace
+{
+using ActivationLayerAlexNetF32 = ActivationLayer<AlexNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer>;
+using ActivationLayerAlexNetQS8 = ActivationLayer<AlexNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::QS8>;
+using ActivationLayerLeNet5 = ActivationLayer<LeNet5ActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::F32>;
+using ActivationLayerGoogLeNet = ActivationLayer<GoogLeNetActivationLayerDataset, Tensor, NEAccessor, NEActivationLayer, DataType::F32>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(ActivationLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(ActivationLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetActivationLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ActivationLayerDataset, 0, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ActivationLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ act_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ActivationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetActivationLayerDataset, 32, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/BitwiseAnd.cpp b/tests/benchmark/NEON/BitwiseAnd.cpp
new file mode 100644
index 0000000000..dba3d1ebea
--- /dev/null
+++ b/tests/benchmark/NEON/BitwiseAnd.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+namespace
+{
+template <typename DataSet>
+class BitwiseAnd : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const std::string image_name = *(DataSet().begin() + state.range(0));
+
+ // Create tensors
+ src1 = create_tensor(image_name, DataType::U8);
+ src2 = create_tensor(image_name, DataType::U8);
+ dst = create_tensor(image_name, DataType::U8);
+
+ // Create and configure function
+ band.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill source tensors
+ library->fill(NEAccessor(src1), image_name, Channel::R);
+ library->fill(NEAccessor(src2), image_name, Channel::G);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+ }
+
+ NEBitwiseAnd band{};
+ Profiler profiler{};
+
+private:
+ Tensor src1{};
+ Tensor src2{};
+ Tensor dst{};
+};
+
+using BitwiseAndSmall = BitwiseAnd<SmallImages>;
+using BitwiseAndLarge = BitwiseAnd<LargeImages>;
+} // namespace
+
+BENCHMARK_DEFINE_F(BitwiseAndSmall, neon_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndSmall, neon_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<SmallImages>);
+
+BENCHMARK_DEFINE_F(BitwiseAndLarge, neon_bitwise_and)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ band.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(BitwiseAndLarge, neon_bitwise_and)
+->Threads(1)
+->Apply(DataSetArgs<LargeImages>);
diff --git a/tests/benchmark/NEON/CMakeLists.txt b/tests/benchmark/NEON/CMakeLists.txt
new file mode 100644
index 0000000000..2cb3eb36c9
--- /dev/null
+++ b/tests/benchmark/NEON/CMakeLists.txt
@@ -0,0 +1,37 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+set(arm_compute_test_benchmark_NEON_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/NEON/NEAccessor.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseAnd.cpp
+)
+
+add_library(arm_compute_test_benchmark_NEON OBJECT
+ ${arm_compute_test_benchmark_NEON_SOURCE_FILES}
+)
+
+SET(arm_compute_test_benchmark_TARGET_OBJECTS
+ ${arm_compute_test_benchmark_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_benchmark_NEON>
+ PARENT_SCOPE
+)
diff --git a/tests/benchmark/NEON/ConvolutionLayer.cpp b/tests/benchmark/NEON/ConvolutionLayer.cpp
new file mode 100644
index 0000000000..0cfff8494b
--- /dev/null
+++ b/tests/benchmark/NEON/ConvolutionLayer.cpp
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerAlexNetF32 = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerAlexNetQS8 = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer, DataType::QS8>;
+using ConvolutionLayerLeNet5 = ConvolutionLayer<LeNet5ConvolutionLayerDataset, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerGoogLeNet1 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset1, Tensor, NEAccessor, NEConvolutionLayer>;
+using ConvolutionLayerGoogLeNet2 = ConvolutionLayer<GoogLeNetConvolutionLayerDataset2, Tensor, NEAccessor, NEConvolutionLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5ConvolutionLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 16, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 17, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 18, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 19, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 20, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 21, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 22, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 23, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 24, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 25, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 26, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 27, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 28, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 29, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 30, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset1, 31, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 9, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 10, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 11, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 12, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 13, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 14, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 15, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerGoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetConvolutionLayerDataset2, 16, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/ConvolutionLayerDirect.cpp b/tests/benchmark/NEON/ConvolutionLayerDirect.cpp
new file mode 100644
index 0000000000..bc56e844d8
--- /dev/null
+++ b/tests/benchmark/NEON/ConvolutionLayerDirect.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/ConvolutionLayer.h"
+
+namespace
+{
+using ConvolutionLayerDirectAlexNet = ConvolutionLayer<AlexNetConvolutionLayerDataset, Tensor, NEAccessor, NEDirectConvolutionLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ conv_layer->run();
+ profiler.stop();
+ }
+}
+
+// Register only the 3x3 convolution layers
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(ConvolutionLayerDirectAlexNet, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetConvolutionLayerDataset, 4, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/FullyConnectedLayer.cpp b/tests/benchmark/NEON/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000..85979203ac
--- /dev/null
+++ b/tests/benchmark/NEON/FullyConnectedLayer.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/FullyConnectedLayer.h"
+
+namespace
+{
+using FullyConnectedLayerAlexNetF32 = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+using FullyConnectedLayerAlexNetQS8 = FullyConnectedLayer<AlexNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer, DataType::QS8>;
+using FullyConnectedLayerLeNet5 = FullyConnectedLayer<LeNet5FullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+using FullyConnectedLayerGoogLeNet = FullyConnectedLayer<GoogLeNetFullyConnectedLayerDataset, Tensor, NEAccessor, NEFullyConnectedLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetFullyConnectedLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(FullyConnectedLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5FullyConnectedLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(FullyConnectedLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ fc_layer->run();
+ profiler.stop();
+ }
+}
+BENCHMARK_REGISTER_F(FullyConnectedLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetFullyConnectedLayerDataset, 0, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/GEMM.cpp b/tests/benchmark/NEON/GEMM.cpp
new file mode 100644
index 0000000000..9190309f1c
--- /dev/null
+++ b/tests/benchmark/NEON/GEMM.cpp
@@ -0,0 +1,709 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/NEON/GEMM.h"
+
+namespace
+{
+#ifdef ENABLE_FP16
+using GEMMFP16GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F16>;
+using GEMMFP16GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::F16>;
+#endif /* ENABLE_FP16 */
+using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F32>;
+using GEMMFP32GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::F32>;
+using GEMMQS8GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::QS8>;
+using GEMMQS8GoogLeNet2 = GEMM<GoogLeNetGEMMDataset2, Tensor, NEAccessor, NEGEMM, DataType::QS8>;
+} // namespace
+#ifdef ENABLE_FP16
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP16GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP16GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+#endif /* ENABLE_FP16 */
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
+
+BENCHMARK_DEFINE_F(GEMMQS8GoogLeNet1, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_DEFINE_F(GEMMQS8GoogLeNet2, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ gemm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 1>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 2>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 3>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 4>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 5>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 6>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 7>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 8>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 9>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 10>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 11>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 12>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 13>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 14>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 15>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 16>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 17>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 18>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 19>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 20>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 21>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 22>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 23>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 24>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 25>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 26>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 27>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 28>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 29>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 30>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet1, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset1, 31>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 0>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 1>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 2>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 3>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 4>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 5>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 6>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 7>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 8>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 9>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 10>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 11>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 12>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 13>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 14>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 15>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 16>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 17>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 18>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 19>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 20>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 21>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 22>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 23>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 24>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 25>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 26>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 27>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 28>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 29>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 30>);
+BENCHMARK_REGISTER_F(GEMMQS8GoogLeNet2, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArg<GoogLeNetGEMMDataset2, 31>);
diff --git a/tests/benchmark/NEON/GEMM.h b/tests/benchmark/NEON/GEMM.h
new file mode 100644
index 0000000000..24d196523f
--- /dev/null
+++ b/tests/benchmark/NEON/GEMM.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+// FIXME: Merge with CL/GEMM.h into common/GEMM.h after adding F16 support to NEON GEMM and QS8 support to CL GEMM
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType data_type>
+class GEMM : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+#ifdef ENABLE_FP16
+ ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F16 && data_type != DataType::F32 && data_type != DataType::QS8, "Unsupported data type for GEMM operation");
+#else /* ENABLE_FP16 */
+ ARM_COMPUTE_ERROR_ON_MSG(data_type != DataType::F32 && data_type != DataType::QS8, "Unsupported data type for GEMM operation");
+#endif /* ENABLE_FP16 */
+
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const GEMMDataObject gemm_obj = *(DataSet().begin() + state.range(0));
+
+ TensorShape shape_a = gemm_obj.shape_a;
+ TensorShape shape_b = gemm_obj.shape_b;
+ TensorShape shape_c = gemm_obj.shape_c;
+ TensorShape shape_d = gemm_obj.shape_d;
+
+ // Create tensors
+ a = create_tensor(shape_a, data_type, 1, 4);
+ b = create_tensor(shape_b, data_type, 1, 4);
+ c = create_tensor(shape_c, data_type, 1, 4);
+ d = create_tensor(shape_d, data_type, 1, 4);
+
+ // Create and configure function
+ gemm_layer = std::unique_ptr<Function>(new Function());
+ gemm_layer->configure(&a, &b, &c, &d, gemm_obj.alpha, gemm_obj.beta);
+
+ // Allocate tensors
+ a.allocator()->allocate();
+ b.allocator()->allocate();
+ c.allocator()->allocate();
+ d.allocator()->allocate();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ gemm_layer.reset();
+
+ a.allocator()->free();
+ b.allocator()->free();
+ c.allocator()->free();
+ d.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> gemm_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType a{};
+ TensorType b{};
+ TensorType c{};
+ TensorType d{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
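+// Note: a minimal usage sketch, mirroring the registrations in NEON/GEMM.cpp above
+// (the alias name and exact template arguments here are illustrative only):
+//
+//   using GEMMFP32GoogLeNet1 = GEMM<GoogLeNetGEMMDataset1, Tensor, NEAccessor, NEGEMM, DataType::F32>;
+//
+//   BENCHMARK_DEFINE_F(GEMMFP32GoogLeNet1, neon_googlenet)
+//   (::benchmark::State &state)
+//   {
+//       while(state.KeepRunning())
+//       {
+//           profiler.start();
+//           gemm_layer->run();
+//           profiler.stop();
+//       }
+//   }
+//
+//   BENCHMARK_REGISTER_F(GEMMFP32GoogLeNet1, neon_googlenet)
+//   ->Threads(1)
+//   ->Apply(DataSetArg<GoogLeNetGEMMDataset1, 0>);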
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_NEON_GEMM_H__
diff --git a/tests/benchmark/NEON/NormalizationLayer.cpp b/tests/benchmark/NEON/NormalizationLayer.cpp
new file mode 100644
index 0000000000..46dc56b84d
--- /dev/null
+++ b/tests/benchmark/NEON/NormalizationLayer.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/NormalizationLayer.h"
+
+namespace
+{
+using NormalizationLayerAlexNetF32 = NormalizationLayer<AlexNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer>;
+using NormalizationLayerAlexNetQS8 = NormalizationLayer<AlexNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer, DataType::QS8>;
+using NormalizationLayerGoogLeNet = NormalizationLayer<GoogLeNetNormalizationLayerDataset, Tensor, NEAccessor, NENormalizationLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetNormalizationLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(NormalizationLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ norm_layer->run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(NormalizationLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetNormalizationLayerDataset, 1, 1, 4, 8>);
diff --git a/tests/benchmark/NEON/PoolingLayer.cpp b/tests/benchmark/NEON/PoolingLayer.cpp
new file mode 100644
index 0000000000..9b071317b4
--- /dev/null
+++ b/tests/benchmark/NEON/PoolingLayer.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/common/PoolingLayer.h"
+
+namespace
+{
+using PoolingLayerAlexNetF32 = PoolingLayer<AlexNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+using PoolingLayerAlexNetQS8 = PoolingLayer<AlexNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer, DataType::QS8>;
+using PoolingLayerLeNet5 = PoolingLayer<LeNet5PoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+using PoolingLayerGoogLeNet = PoolingLayer<GoogLeNetPoolingLayerDataset, Tensor, NEAccessor, NEPoolingLayer>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(PoolingLayerAlexNetF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetF32, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+// QS8
+BENCHMARK_DEFINE_F(PoolingLayerAlexNetQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerAlexNetQS8, neon_alexnet)
+->Threads(1)
+->Apply(DataSetArgBatched<AlexNetPoolingLayerDataset, 2, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerLeNet5, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerLeNet5, neon_lenet5)
+->Threads(1)
+->Apply(DataSetArgBatched<LeNet5PoolingLayerDataset, 1, 1, 4, 8>);
+
+BENCHMARK_DEFINE_F(PoolingLayerGoogLeNet, neon_googlenet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run function
+ profiler.start();
+ pool_layer.run();
+ profiler.stop();
+ }
+}
+
+// FIXME: Add support for 7x7 pooling layer pool5/7x7_s1
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 0, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 1, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 2, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 3, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 4, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 5, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 6, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 7, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 8, 1, 4, 8>);
+BENCHMARK_REGISTER_F(PoolingLayerGoogLeNet, neon_googlenet)
+->Threads(1)
+->Apply(DataSetArgBatched<GoogLeNetPoolingLayerDataset, 9, 1, 4, 8>);
diff --git a/tests/benchmark/PMUCounter.cpp b/tests/benchmark/PMUCounter.cpp
new file mode 100644
index 0000000000..e87dae82e6
--- /dev/null
+++ b/tests/benchmark/PMUCounter.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PMUCounter.h"
+
+#include "Utils.h"
+
+#define _GNU_SOURCE 1
+#include <asm/unistd.h>
+#include <csignal>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <stdexcept>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+CycleCounter::CycleCounter()
+{
+ const pid_t pid = getpid();
+
+ struct perf_event_attr perf_config
+ {
+ };
+ memset(&perf_config, 0, sizeof(struct perf_event_attr));
+
+ perf_config.config = PERF_COUNT_HW_CPU_CYCLES;
+ perf_config.size = sizeof(struct perf_event_attr);
+ perf_config.type = PERF_TYPE_HARDWARE;
+ // The inherit bit specifies that this counter should count events of child
+ // tasks as well as the task specified
+ perf_config.inherit = 1;
+ // Enables saving of event counts on context switch for inherited tasks
+ perf_config.inherit_stat = 1;
+
+ _fd = syscall(__NR_perf_event_open, &perf_config, pid, -1, -1, 0);
+
+ if(_fd < 0)
+ {
+ throw std::runtime_error("perf_event_open for cycles failed");
+ }
+}
+
+std::string CycleCounter::id() const
+{
+ return "Cycle Counter";
+}
+
+void CycleCounter::start()
+{
+ ioctl(_fd, PERF_EVENT_IOC_RESET, 0);
+ ioctl(_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void CycleCounter::stop()
+{
+ ioctl(_fd, PERF_EVENT_IOC_DISABLE, 0);
+ read(_fd, &_cycles, sizeof(_cycles));
+}
+
+std::unique_ptr<Instrument::IMeasurement> CycleCounter::get_measurement() const
+{
+ return ::arm_compute::test::cpp14::make_unique<Instrument::Measurement<long long>>(_cycles);
+}
+
+InstructionCounter::InstructionCounter()
+{
+ const pid_t pid = getpid();
+
+ struct perf_event_attr perf_config
+ {
+ };
+ memset(&perf_config, 0, sizeof(struct perf_event_attr));
+
+ perf_config.config = PERF_COUNT_HW_INSTRUCTIONS;
+ perf_config.size = sizeof(struct perf_event_attr);
+ perf_config.type = PERF_TYPE_HARDWARE;
+ // The inherit bit specifies that this counter should count events of child
+ // tasks as well as the task specified
+ perf_config.inherit = 1;
+ // Enables saving of event counts on context switch for inherited tasks
+ perf_config.inherit_stat = 1;
+
+ _fd = syscall(__NR_perf_event_open, &perf_config, pid, -1, -1, 0);
+
+ if(_fd < 0)
+ {
+ throw std::runtime_error("perf_event_open for instructions failed");
+ }
+}
+
+std::string InstructionCounter::id() const
+{
+ return "Instruction Counter";
+}
+
+void InstructionCounter::start()
+{
+ ioctl(_fd, PERF_EVENT_IOC_RESET, 0);
+ ioctl(_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void InstructionCounter::stop()
+{
+ ioctl(_fd, PERF_EVENT_IOC_DISABLE, 0);
+ read(_fd, &_instructions, sizeof(_instructions));
+}
+
+std::unique_ptr<Instrument::IMeasurement> InstructionCounter::get_measurement() const
+{
+ return std::unique_ptr<Instrument::IMeasurement>(new Instrument::Measurement<long long>(_instructions));
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
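+// Note: a minimal sketch of driving one of these counters directly; the Profiler
+// in this patch drives registered instruments through the same sequence:
+//
+//   CycleCounter counter;     // opens the perf event, throws std::runtime_error on failure
+//   counter.start();          // PERF_EVENT_IOC_RESET followed by PERF_EVENT_IOC_ENABLE
+//   workload();               // placeholder for the code under measurement
+//   counter.stop();           // PERF_EVENT_IOC_DISABLE, then read() the raw count
+//   auto cycles = counter.get_measurement();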
diff --git a/tests/benchmark/PMUCounter.h b/tests/benchmark/PMUCounter.h
new file mode 100644
index 0000000000..de45f319f6
--- /dev/null
+++ b/tests/benchmark/PMUCounter.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
+
+#include "Instrument.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Implementation of an instrument to count CPU cycles. */
+class CycleCounter : public Instrument
+{
+public:
+ /** Initialise the cycle counter. */
+ CycleCounter();
+
+ std::string id() const override;
+ void start() override;
+ void stop() override;
+ std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+ long _fd{ -1 };
+ long long _cycles{ 0 };
+};
+
+/** Implementation of an instrument to count executed CPU instructions. */
+class InstructionCounter : public Instrument
+{
+public:
+ /** Initialise the instruction counter. */
+ InstructionCounter();
+
+ std::string id() const override;
+ void start() override;
+ void stop() override;
+ std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+ long _fd{ -1 };
+ long long _instructions{ 0 };
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_PMU_COUNTER_H__
diff --git a/tests/benchmark/PerformanceProgramOptions.cpp b/tests/benchmark/PerformanceProgramOptions.cpp
new file mode 100644
index 0000000000..b4becc3c69
--- /dev/null
+++ b/tests/benchmark/PerformanceProgramOptions.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PerformanceProgramOptions.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma GCC diagnostic ignored "-Wctor-dtor-privacy"
+#include "boost/program_options.hpp"
+#pragma GCC diagnostic pop
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+PerformanceProgramOptions::PerformanceProgramOptions()
+{
+ boost::program_options::options_description options("Performance options");
+ options.add_options()("runs", boost::program_options::value<unsigned int>()->default_value(1), "Repetitions per test");
+ options.add_options()("threads", boost::program_options::value<unsigned int>()->default_value(1), "Number of parallel CPU threads");
+ add_options(options);
+}
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/PerformanceProgramOptions.h b/tests/benchmark/PerformanceProgramOptions.h
new file mode 100644
index 0000000000..671e263bb2
--- /dev/null
+++ b/tests/benchmark/PerformanceProgramOptions.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
+#define __ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
+
+#include "ProgramOptions.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+/** Subclass of @ref ProgramOptions that adds performance-specific options. */
+class PerformanceProgramOptions : public ProgramOptions
+{
+public:
+    /** Defines additional options. */
+ PerformanceProgramOptions();
+};
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_PERFORMANCE_PROGRAM_OPTIONS_H__
diff --git a/tests/benchmark/PerformanceUserConfiguration.cpp b/tests/benchmark/PerformanceUserConfiguration.cpp
new file mode 100644
index 0000000000..ca412d660a
--- /dev/null
+++ b/tests/benchmark/PerformanceUserConfiguration.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PerformanceUserConfiguration.h"
+
+#include "ProgramOptions.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace performance
+{
+PerformanceUserConfiguration::PerformanceUserConfiguration(const ProgramOptions &options)
+ : UserConfiguration(options)
+{
+ unsigned int tmp_runs = 0;
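+    // Only override "runs" if it is available in the parsed options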
+ if(options.get("runs", tmp_runs))
+ {
+ runs = tmp_runs;
+ }
+}
+} // namespace performance
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/PerformanceUserConfiguration.h b/tests/benchmark/PerformanceUserConfiguration.h
new file mode 100644
index 0000000000..a140d404c8
--- /dev/null
+++ b/tests/benchmark/PerformanceUserConfiguration.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
+#define __ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
+
+#include "UserConfiguration.h"
+
+namespace arm_compute
+{
+namespace test
+{
+class ProgramOptions;
+
+namespace performance
+{
+/** Specialisation of @ref UserConfiguration to provide performance-specific
+ * configuration options.
+ */
+struct PerformanceUserConfiguration : public UserConfiguration
+{
+ PerformanceUserConfiguration() = default;
+
+ /** Initialise the configuration according to the program options.
+ *
+ * @param[in] options Parsed command line options.
+ */
+ PerformanceUserConfiguration(const ProgramOptions &options);
+
+ Option<unsigned int> runs{};
+};
+} // namespace performance
+
+extern performance::PerformanceUserConfiguration user_config;
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_PERFORMANCE_PERFORMANCE_USER_CONFIGURATION_H__
diff --git a/tests/benchmark/Profiler.cpp b/tests/benchmark/Profiler.cpp
new file mode 100644
index 0000000000..f3ce94164f
--- /dev/null
+++ b/tests/benchmark/Profiler.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Profiler.h"
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <utility>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+void Profiler::add(const std::shared_ptr<Instrument> &instrument)
+{
+ _instruments.push_back(instrument);
+}
+
+void Profiler::start()
+{
+ for(auto &instrument : _instruments)
+ {
+ instrument->start();
+ }
+}
+
+void Profiler::stop()
+{
+ for(auto &instrument : _instruments)
+ {
+ instrument->stop();
+ }
+
+ for(const auto &instrument : _instruments)
+ {
+ _measurements[instrument->id()].push_back(*instrument->get_measurement());
+ }
+}
+
+void Profiler::submit(::benchmark::State &state)
+{
+ for(auto &instrument : _measurements)
+ {
+ double sum_values = std::accumulate(instrument.second.begin(), instrument.second.end(), 0.);
+ size_t num_values = instrument.second.size();
+
+ if(num_values > 2)
+ {
+ auto minmax_values = std::minmax_element(instrument.second.begin(), instrument.second.end());
+ state.counters[instrument.first + "_min"] = *minmax_values.first;
+ state.counters[instrument.first + "_max"] = *minmax_values.second;
+ sum_values -= *minmax_values.first + *minmax_values.second;
+ num_values -= 2;
+ }
+ state.counters[instrument.first] = sum_values / num_values;
+ instrument.second.clear();
+ }
+}
+
+const Profiler::MeasurementsMap &Profiler::measurements() const
+{
+ return _measurements;
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/Profiler.h b/tests/benchmark/Profiler.h
new file mode 100644
index 0000000000..03922f4704
--- /dev/null
+++ b/tests/benchmark/Profiler.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
+
+#include "Instrument.h"
+
+#include "benchmark/benchmark_api.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+class Profiler
+{
+public:
+ /** Mapping from instrument ids to their measurements. */
+ using MeasurementsMap = std::map<std::string, std::vector<double>>;
+
+    /** Add @p instrument to the performance monitor.
+ *
+ * All added instruments will be used when @ref start or @ref stop are
+ * called to make measurements.
+ *
+ * @param[in] instrument Instrument to be used to measure performance.
+ */
+ void add(const std::shared_ptr<Instrument> &instrument);
+
+ /** Start all added instruments to measure performance. */
+ void start();
+
+ /** Stop all added instruments. */
+ void stop();
+
+ /** Commit all measured values to the current active test. */
+ void submit(::benchmark::State &state);
+
+ /** Return measurements for all instruments. */
+ const MeasurementsMap &measurements() const;
+
+private:
+ std::vector<std::shared_ptr<Instrument>> _instruments{};
+ MeasurementsMap _measurements{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
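+// Note: typical lifecycle, as used by the benchmark fixtures in this patch:
+//
+//   Profiler profiler;
+//   profiler.add(std::make_shared<WallClockTimer>());
+//   profiler.start();
+//   function_under_test();    // placeholder for the measured call
+//   profiler.stop();          // records one measurement per instrument
+//   profiler.submit(state);   // publishes aggregated counters to the benchmark state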
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_PROFILER_H__
diff --git a/tests/benchmark/WallClockTimer.cpp b/tests/benchmark/WallClockTimer.cpp
new file mode 100644
index 0000000000..9ab53d0b3c
--- /dev/null
+++ b/tests/benchmark/WallClockTimer.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "WallClockTimer.h"
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+std::string WallClockTimer::id() const
+{
+ return "Wall clock";
+}
+
+void WallClockTimer::start()
+{
+ _start = std::chrono::high_resolution_clock::now();
+}
+
+void WallClockTimer::stop()
+{
+ _stop = std::chrono::high_resolution_clock::now();
+}
+
+std::unique_ptr<Instrument::IMeasurement> WallClockTimer::get_measurement() const
+{
+ const std::chrono::duration<float, std::milli> delta = _stop - _start;
+ return ::arm_compute::test::cpp14::make_unique<Instrument::Measurement<float>>(delta.count());
+}
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/WallClockTimer.h b/tests/benchmark/WallClockTimer.h
new file mode 100644
index 0000000000..cf6828e88d
--- /dev/null
+++ b/tests/benchmark/WallClockTimer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
+
+#include "Instrument.h"
+
+#include <chrono>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+/** Implementation of an instrument to measure elapsed wall-clock time in milliseconds. */
+class WallClockTimer : public Instrument
+{
+public:
+ std::string id() const override;
+ void start() override;
+ void stop() override;
+ std::unique_ptr<Instrument::IMeasurement> get_measurement() const override;
+
+private:
+ std::chrono::high_resolution_clock::time_point _start{};
+ std::chrono::high_resolution_clock::time_point _stop{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_WALL_CLOCK_TIMER_H__
diff --git a/tests/benchmark/common/ActivationLayer.h b/tests/benchmark/common/ActivationLayer.h
new file mode 100644
index 0000000000..7edfb6ef3c
--- /dev/null
+++ b/tests/benchmark/common/ActivationLayer.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ActivationLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class ActivationLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const ActivationLayerDataObject act_obj = *(DataSet().begin() + state.range(0));
+
+        // Add the batch dimension to the source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape shape = act_obj.shape;
+ shape.set(shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(shape, dt, 1, fixed_point_position);
+ dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ act_layer.configure(&src, &dst, act_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ src.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ Function act_layer{};
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_ACTIVATION_LAYER_H__
diff --git a/tests/benchmark/common/ConvolutionLayer.h b/tests/benchmark/common/ConvolutionLayer.h
new file mode 100644
index 0000000000..594c62c50e
--- /dev/null
+++ b/tests/benchmark/common/ConvolutionLayer.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class ConvolutionLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const ConvolutionLayerDataObject conv_obj = *(DataSet().begin() + state.range(0));
+
+        // Set batch dimension in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape src_shape = conv_obj.src_shape;
+ TensorShape dst_shape = conv_obj.dst_shape;
+ src_shape.set(3 /* batch */, batches);
+ dst_shape.set(3 /* batch */, batches);
+
+ // Create tensors
+ src = create_tensor(src_shape, dt, 1, fixed_point_position);
+ weights = create_tensor(conv_obj.weights_shape, dt, 1, fixed_point_position);
+ bias = create_tensor(conv_obj.bias_shape, dt, 1, fixed_point_position);
+ dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ conv_layer = std::unique_ptr<Function>(new Function());
+ conv_layer->configure(&src, &weights, &bias, &dst, conv_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ library->fill_tensor_uniform(Accessor(weights), 1);
+ library->fill_tensor_uniform(Accessor(bias), 2);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ conv_layer.reset();
+
+ src.allocator()->free();
+ weights.allocator()->free();
+ bias.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> conv_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType weights{};
+ TensorType bias{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_CONVOLUTION_LAYER_H__
diff --git a/tests/benchmark/common/FullyConnectedLayer.h b/tests/benchmark/common/FullyConnectedLayer.h
new file mode 100644
index 0000000000..88adf83f2a
--- /dev/null
+++ b/tests/benchmark/common/FullyConnectedLayer.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/ConvolutionLayerDataset.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class FullyConnectedLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const FullyConnectedLayerDataObject fc_obj = *(DataSet().begin() + state.range(0));
+
+        // Set batch dimension in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape src_shape = fc_obj.src_shape;
+ TensorShape dst_shape = fc_obj.dst_shape;
+ src_shape.set(src_shape.num_dimensions(), batches);
+ dst_shape.set(dst_shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(src_shape, dt, 1, fixed_point_position);
+ weights = create_tensor(fc_obj.weights_shape, dt, 1, fixed_point_position);
+ bias = create_tensor(fc_obj.bias_shape, dt, 1, fixed_point_position);
+ dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ fc_layer = std::unique_ptr<Function>(new Function());
+ fc_layer->configure(&src, &weights, &bias, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ library->fill_tensor_uniform(Accessor(weights), 1);
+ library->fill_tensor_uniform(Accessor(bias), 2);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ fc_layer.reset();
+
+ src.allocator()->free();
+ weights.allocator()->free();
+ bias.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> fc_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType weights{};
+ TensorType bias{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_FULLYCONNECTED_LAYER_H__
diff --git a/tests/benchmark/common/NormalizationLayer.h b/tests/benchmark/common/NormalizationLayer.h
new file mode 100644
index 0000000000..4593fb7df3
--- /dev/null
+++ b/tests/benchmark/common/NormalizationLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/NormalizationLayerDataset.h"
+
+#include <memory>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class NormalizationLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const NormalizationLayerDataObject norm_obj = *(DataSet().begin() + state.range(0));
+
+        // Set batch dimension in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape shape = norm_obj.shape;
+ shape.set(shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(shape, dt, 1, fixed_point_position);
+ dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ norm_layer = std::unique_ptr<Function>(new Function());
+ norm_layer->configure(&src, &dst, norm_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ norm_layer.reset();
+
+ src.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ std::unique_ptr<Function> norm_layer{ nullptr };
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_NORMALIZATION_LAYER_H__
diff --git a/tests/benchmark/common/PoolingLayer.h b/tests/benchmark/common/PoolingLayer.h
new file mode 100644
index 0000000000..5bb332fd6b
--- /dev/null
+++ b/tests/benchmark/common/PoolingLayer.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "dataset/PoolingLayerDataset.h"
+
+#include <memory>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename DataSet, typename TensorType, typename Accessor, typename Function, DataType dt = DataType::F32>
+class PoolingLayer : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const PoolingLayerDataObject pool_obj = *(DataSet().begin() + state.range(0));
+
+        // Set batch dimension in source and destination shapes
+ const unsigned int batches = state.range(1);
+ const unsigned int fixed_point_position = 4;
+ TensorShape src_shape = pool_obj.src_shape;
+ TensorShape dst_shape = pool_obj.dst_shape;
+ src_shape.set(src_shape.num_dimensions(), batches);
+ dst_shape.set(dst_shape.num_dimensions(), batches);
+
+ // Create tensors
+ src = create_tensor(src_shape, dt, 1, fixed_point_position);
+ dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ pool_layer.configure(&src, &dst, pool_obj.info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+        // Free tensors
+ src.allocator()->free();
+ dst.allocator()->free();
+
+ profiler.submit(state);
+ }
+
+ Function pool_layer{};
+ Profiler profiler{};
+
+private:
+ TensorType src{};
+ TensorType dst{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_POOLING_LAYER_H__
diff --git a/tests/benchmark/main.cpp b/tests/benchmark/main.cpp
new file mode 100644
index 0000000000..acde259d9b
--- /dev/null
+++ b/tests/benchmark/main.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "PMUCounter.h"
+#include "PerformanceProgramOptions.h"
+#include "PerformanceUserConfiguration.h"
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "WallClockTimer.h"
+
+#include "benchmark/benchmark_api.h"
+
+#ifdef OPENCL
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#endif
+#include "arm_compute/runtime/Scheduler.h"
+
+#include <iostream>
+#include <memory>
+
+using namespace arm_compute::test;
+using namespace arm_compute::test::performance;
+
+namespace arm_compute
+{
+namespace test
+{
+PerformanceUserConfiguration user_config;
+std::unique_ptr<TensorLibrary> library;
+} // namespace test
+} // namespace arm_compute
+
+int main(int argc, char **argv)
+{
+ PerformanceProgramOptions options;
+ try
+ {
+ options.parse_commandline(argc, argv);
+
+ if(options.wants_help())
+ {
+ std::cout << "Usage: " << argv[0] << " [options] PATH\n";
+ std::cout << options.get_help() << "\n";
+ }
+
+ user_config = PerformanceUserConfiguration(options);
+ }
+ catch(const boost::program_options::required_option &err)
+ {
+ std::cerr << "Error: " << err.what() << "\n";
+ std::cout << "\nUsage: " << argv[0] << " [options] PATH\n";
+ std::cout << options.get_help() << "\n";
+ return 1;
+ }
+
+ ::benchmark::Initialize(&argc, argv);
+
+ if(user_config.seed.is_set())
+ {
+ library = cpp14::make_unique<TensorLibrary>(user_config.path.get(), user_config.seed);
+ }
+ else
+ {
+ library = cpp14::make_unique<TensorLibrary>(user_config.path.get());
+ }
+
+#ifdef OPENCL
+ arm_compute::CLScheduler::get().default_init();
+#endif
+
+ std::cout << "Using " << user_config.threads << " CPU " << (user_config.threads == 1 ? "thread" : "threads") << "\n";
+ arm_compute::Scheduler::get().set_num_threads(user_config.threads);
+
+ ::benchmark::RunSpecifiedBenchmarks();
+}
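
main() allocates the tensor library through cpp14::make_unique, a helper needed because std::make_unique only arrived in C++14. A minimal sketch of the usual single-object backport such a helper provides (this is the common idiom, not necessarily the library's exact implementation):

    #include <memory>
    #include <utility>

    namespace cpp14
    {
    // Single-object form of the conventional std::make_unique backport
    template <typename T, typename... Args>
    std::unique_ptr<T> make_unique(Args &&... args)
    {
        return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
    }
    } // namespace cpp14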
diff --git a/tests/benchmark/system_tests/CL/AlexNet.cpp b/tests/benchmark/system_tests/CL/AlexNet.cpp
new file mode 100644
index 0000000000..fe0b9913de
--- /dev/null
+++ b/tests/benchmark/system_tests/CL/AlexNet.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/system_tests/common/AlexNet.h"
+
+namespace
+{
+using AlexNetSystemTest = AlexNetFixture<ICLTensor,
+ CLTensor,
+ CLSubTensor,
+ CLAccessor,
+ CLActivationLayer,
+ CLConvolutionLayer,
+ CLFullyConnectedLayer,
+ CLNormalizationLayer,
+ CLPoolingLayer,
+ CLSoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(AlexNetSystemTest, cl_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run AlexNet
+ profiler.start();
+ network.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTest, cl_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
\ No newline at end of file
diff --git a/tests/benchmark/system_tests/CL/LeNet5.cpp b/tests/benchmark/system_tests/CL/LeNet5.cpp
new file mode 100644
index 0000000000..d65a7dde6c
--- /dev/null
+++ b/tests/benchmark/system_tests/CL/LeNet5.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::cl;
+
+#include "benchmark/system_tests/common/LeNet5.h"
+
+namespace
+{
+using LeNet5SystemTest = LeNet5Fixture<CLTensor,
+ CLAccessor,
+ CLActivationLayer,
+ CLConvolutionLayer,
+ CLFullyConnectedLayer,
+ CLPoolingLayer,
+ CLSoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(LeNet5SystemTest, cl_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run LeNet5
+ profiler.start();
+ network.run();
+ CLScheduler::get().sync();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(LeNet5SystemTest, cl_lenet5)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(16)
+->Arg(32);
diff --git a/tests/benchmark/system_tests/NEON/AlexNet.cpp b/tests/benchmark/system_tests/NEON/AlexNet.cpp
new file mode 100644
index 0000000000..2d222e7309
--- /dev/null
+++ b/tests/benchmark/system_tests/NEON/AlexNet.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/system_tests/common/AlexNet.h"
+
+namespace
+{
+using AlexNetSystemTestF32 = AlexNetFixture<ITensor,
+ Tensor,
+ SubTensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NENormalizationLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer,
+ DataType::F32>;
+
+using AlexNetSystemTestQS8 = AlexNetFixture<ITensor,
+ Tensor,
+ SubTensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NENormalizationLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer,
+ DataType::QS8>;
+} // namespace
+
+// F32
+BENCHMARK_DEFINE_F(AlexNetSystemTestF32, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run AlexNet
+ profiler.start();
+ network.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTestF32, neon_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
+
+// QS8
+BENCHMARK_DEFINE_F(AlexNetSystemTestQS8, neon_alexnet)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run AlexNet
+ profiler.start();
+ network.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(AlexNetSystemTestQS8, neon_alexnet)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(4)
+->Arg(8);
\ No newline at end of file
diff --git a/tests/benchmark/system_tests/NEON/LeNet5.cpp b/tests/benchmark/system_tests/NEON/LeNet5.cpp
new file mode 100644
index 0000000000..5170f05a70
--- /dev/null
+++ b/tests/benchmark/system_tests/NEON/LeNet5.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "benchmark/Datasets.h"
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "benchmark/benchmark_api.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+using namespace arm_compute::test::neon;
+
+#include "benchmark/system_tests/common/LeNet5.h"
+
+namespace
+{
+using LeNet5SystemTest = LeNet5Fixture<Tensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer>;
+} // namespace
+
+BENCHMARK_DEFINE_F(LeNet5SystemTest, neon_lenet5)
+(::benchmark::State &state)
+{
+ while(state.KeepRunning())
+ {
+ // Run LeNet5
+ profiler.start();
+ network.run();
+ profiler.stop();
+ }
+}
+
+BENCHMARK_REGISTER_F(LeNet5SystemTest, neon_lenet5)
+->Threads(1)
+->Iterations(10)
+->ArgName("batch_size")
+->Arg(1)
+->Arg(16)
+->Arg(32);
diff --git a/tests/benchmark/system_tests/common/AlexNet.h b/tests/benchmark/system_tests/common/AlexNet.h
new file mode 100644
index 0000000000..9c93dc7228
--- /dev/null
+++ b/tests/benchmark/system_tests/common/AlexNet.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "model_objects/AlexNet.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename ITensorType,
+ typename TensorType,
+ typename SubTensorType,
+ typename Accessor,
+ typename ActivationLayerFunction,
+ typename ConvolutionLayerFunction,
+ typename FullyConnectedLayerFunction,
+ typename NormalizationLayerFunction,
+ typename PoolingLayerFunction,
+ typename SoftmaxLayerFunction,
+ DataType dt = DataType::F32>
+class AlexNetFixture : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ const unsigned int batches = static_cast<unsigned int>(state.range(0));
+ const bool weights_transposed = true;
+
+ network.init_weights(batches, weights_transposed);
+ network.build();
+ network.allocate();
+ network.fill_random();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+ network.clear();
+ }
+
+ Profiler profiler{};
+ model_objects::AlexNet<ITensorType,
+ TensorType,
+ SubTensorType,
+ Accessor,
+ ActivationLayerFunction,
+ ConvolutionLayerFunction,
+ FullyConnectedLayerFunction,
+ NormalizationLayerFunction,
+ PoolingLayerFunction,
+ SoftmaxLayerFunction,
+ dt>
+ network{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_ALEXNET_H__
diff --git a/tests/benchmark/system_tests/common/LeNet5.h b/tests/benchmark/system_tests/common/LeNet5.h
new file mode 100644
index 0000000000..db34f6813a
--- /dev/null
+++ b/tests/benchmark/system_tests/common/LeNet5.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__
+#define __ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include "benchmark/Profiler.h"
+#include "benchmark/WallClockTimer.h"
+
+#include "model_objects/LeNet5.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::benchmark;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename TensorType,
+ typename Accessor,
+ typename ActivationLayerFunction,
+ typename ConvolutionLayerFunction,
+ typename FullyConnectedLayerFunction,
+ typename PoolingLayerFunction,
+ typename SoftmaxLayerFunction>
+class LeNet5Fixture : public ::benchmark::Fixture
+{
+public:
+ void SetUp(::benchmark::State &state) override
+ {
+ profiler.add(std::make_shared<WallClockTimer>());
+
+ network.build(static_cast<unsigned int>(state.range(0)));
+ network.fill_random();
+ }
+
+ void TearDown(::benchmark::State &state) override
+ {
+ profiler.submit(state);
+ network.clear();
+ }
+
+ Profiler profiler{};
+ model_objects::LeNet5<TensorType,
+ Accessor,
+ ActivationLayerFunction,
+ ConvolutionLayerFunction,
+ FullyConnectedLayerFunction,
+ PoolingLayerFunction,
+ SoftmaxLayerFunction>
+ network{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BENCHMARK_LENET5_H__
diff --git a/tests/boost_wrapper.h b/tests/boost_wrapper.h
new file mode 100644
index 0000000000..b584e4cd1f
--- /dev/null
+++ b/tests/boost_wrapper.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverloaded-virtual"
+#pragma GCC diagnostic ignored "-Weffc++"
+#pragma GCC diagnostic ignored "-Wctor-dtor-privacy"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#include "boost/test/unit_test.hpp"
+#include "boost/variant.hpp"
+#include "boost/variant/multivisitors.hpp"
+#pragma GCC diagnostic pop
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#include "boost/test/data/test_case.hpp"
+#pragma GCC diagnostic pop
+
+#include "boost/test/data/monomorphic.hpp"
diff --git a/tests/dataset/ActivationFunctionDataset.h b/tests/dataset/ActivationFunctionDataset.h
new file mode 100644
index 0000000000..11e4baac78
--- /dev/null
+++ b/tests/dataset/ActivationFunctionDataset.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_ACTIVATION_FUNCTION_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_ACTIVATION_FUNCTION_DATASET_H__
+
+#include "arm_compute/core/Types.h"
+#include "dataset/GenericDataset.h"
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** Data set containing all possible activation functions.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on all activation functions.
+ */
+class ActivationFunctions final : public GenericDataset<ActivationLayerInfo::ActivationFunction, 9>
+{
+public:
+ ActivationFunctions()
+ : GenericDataset
+ {
+ ActivationLayerInfo::ActivationFunction::ABS,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LINEAR,
+ ActivationLayerInfo::ActivationFunction::LOGISTIC,
+ ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::SOFT_RELU,
+ ActivationLayerInfo::ActivationFunction::SQRT,
+ ActivationLayerInfo::ActivationFunction::SQUARE,
+ ActivationLayerInfo::ActivationFunction::TANH
+ }
+ {
+ }
+
+ ~ActivationFunctions() = default;
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_ACTIVATION_FUNCTION_DATASET_H__
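
As the class comment says, the data set is meant to drive Boost data test cases. A hypothetical sketch of that usage, assuming the usual Boost.Test module setup and that GenericDataset models Boost's dataset concept as its size()/begin()/arity interface suggests (the test name and body are illustrative):

    #include "boost_wrapper.h"
    #include "dataset/ActivationFunctionDataset.h"

    using namespace arm_compute;
    using namespace arm_compute::test;

    // Boost instantiates this test case once per activation function in the set
    BOOST_DATA_TEST_CASE(activation_functions, ActivationFunctions(), function)
    {
        const ActivationLayerInfo info(function);
        BOOST_CHECK(info.activation() == function);
    }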
diff --git a/tests/dataset/ActivationLayerDataset.h b/tests/dataset/ActivationLayerDataset.h
new file mode 100644
index 0000000000..9d7cffba14
--- /dev/null
+++ b/tests/dataset/ActivationLayerDataset.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_ACTIVATION_LAYER_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_ACTIVATION_LAYER_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "dataset/GenericDataset.h"
+
+#include <sstream>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+class ActivationLayerDataObject
+{
+public:
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "ActivationLayer";
+ ss << "_I" << shape;
+ ss << "_F_" << info.activation();
+ return ss.str();
+ }
+
+public:
+ TensorShape shape;
+ ActivationLayerInfo info;
+};
+
+template <unsigned int Size>
+using ActivationLayerDataset = GenericDataset<ActivationLayerDataObject, Size>;
+
+class AlexNetActivationLayerDataset final : public ActivationLayerDataset<5>
+{
+public:
+ AlexNetActivationLayerDataset()
+ : GenericDataset
+ {
+ ActivationLayerDataObject{ TensorShape(55U, 55U, 96U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ ActivationLayerDataObject{ TensorShape(27U, 27U, 256U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ ActivationLayerDataObject{ TensorShape(13U, 13U, 384U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ ActivationLayerDataObject{ TensorShape(13U, 13U, 256U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ ActivationLayerDataObject{ TensorShape(4096U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ }
+ {
+ }
+
+ ~AlexNetActivationLayerDataset() = default;
+};
+
+class LeNet5ActivationLayerDataset final : public ActivationLayerDataset<1>
+{
+public:
+ LeNet5ActivationLayerDataset()
+ : GenericDataset
+ {
+ ActivationLayerDataObject{ TensorShape(500U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ }
+ {
+ }
+
+ ~LeNet5ActivationLayerDataset() = default;
+};
+
+class GoogLeNetActivationLayerDataset final : public ActivationLayerDataset<33>
+{
+public:
+ GoogLeNetActivationLayerDataset()
+ : GenericDataset
+ {
+ // conv1/relu_7x7
+ ActivationLayerDataObject{ TensorShape(112U, 112U, 64U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // conv2/relu_3x3_reduce
+ ActivationLayerDataObject{ TensorShape(56U, 56U, 64U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // conv2/relu_3x3
+ ActivationLayerDataObject{ TensorShape(56U, 56U, 192U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_3a/relu_1x1, inception_3b/relu_pool_proj
+ ActivationLayerDataObject{ TensorShape(28U, 28U, 64U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_3a/relu_3x3_reduce, inception_3b/relu_5x5
+ ActivationLayerDataObject{ TensorShape(28U, 28U, 96U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_3a/relu_3x3, inception_3b/relu_1x1, inception_3b/relu_3x3_reduce
+ ActivationLayerDataObject{ TensorShape(28U, 28U, 128U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_3a/relu_5x5_reduce
+ ActivationLayerDataObject{ TensorShape(28U, 28U, 16U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_3a/relu_5x5, inception_3a/relu_pool_proj, inception_3b/relu_5x5_reduce
+ ActivationLayerDataObject{ TensorShape(28U, 28U, 32U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_3b/relu_3x3
+ ActivationLayerDataObject{ TensorShape(28U, 28U, 192U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4a/relu_1x1
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 192U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4a/relu_3x3_reduce
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 96U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4a/relu_3x3
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 208U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4a/relu_5x5_reduce
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 16U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4a/relu_5x5
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 48U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4a/relu_pool_proj, inception_4b/relu_5x5, inception_4b/relu_pool_proj, inception_4c/relu_5x5, inception_4c/relu_pool_proj, inception_4d/relu_5x5, inception_4d/relu_pool_proj
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 64U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4b/relu_1x1, inception_4e/relu_3x3_reduce
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 160U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4b/relu_3x3_reduce, inception_4d/relu_1x1
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 112U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4b/relu_3x3
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 224U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4b/relu_5x5_reduce, inception_4c/relu_5x5_reduce
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 24U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4c/relu_1x1, inception_4c/relu_3x3_reduce, inception_4e/relu_5x5, inception_4e/relu_pool_proj
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 128U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4c/relu_3x3, inception_4e/relu_1x1
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 256U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4d/relu_3x3_reduce
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 144U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4d/relu_3x3
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 288U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4d/relu_5x5_reduce, inception_4e/relu_5x5_reduce
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 32U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_4e/relu_3x3
+ ActivationLayerDataObject{ TensorShape(14U, 14U, 320U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5a/relu_1x1
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 256U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5a/relu_3x3_reduce
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 160U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5a/relu_3x3
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 320U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5a/relu_5x5_reduce
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 32U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5a/relu_5x5, inception_5a/relu_pool_proj, inception_5b/relu_5x5, inception_5b/relu_pool_proj
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 128U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5b/relu_1x1, inception_5b/relu_3x3
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 384U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5b/relu_3x3_reduce
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 192U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) },
+ // inception_5b/relu_5x5_reduce
+ ActivationLayerDataObject{ TensorShape(7U, 7U, 48U), ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) }
+ }
+ {
+ }
+
+ ~GoogLeNetActivationLayerDataset() = default;
+};
+
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_ACTIVATION_LAYER_DATASET_H__
diff --git a/tests/dataset/BatchNormalizationLayerDataset.h b/tests/dataset/BatchNormalizationLayerDataset.h
new file mode 100644
index 0000000000..4323b8fe93
--- /dev/null
+++ b/tests/dataset/BatchNormalizationLayerDataset.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_BATCH_NORMALIZATION_LAYER_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_BATCH_NORMALIZATION_LAYER_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "dataset/GenericDataset.h"
+
+#include <ostream>
+#include <sstream>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+class BatchNormalizationLayerDataObject
+{
+public:
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "BatchNormalizationLayer";
+ ss << "_I" << shape0;
+ ss << "_I" << shape1;
+ ss << "_I" << epsilon;
+ return ss.str();
+ }
+
+ friend std::ostream &operator<<(std::ostream &s, const BatchNormalizationLayerDataObject &obj)
+ {
+ s << static_cast<std::string>(obj);
+ return s;
+ }
+
+public:
+ TensorShape shape0;
+ TensorShape shape1;
+ float epsilon;
+};
+
+template <unsigned int Size>
+using BatchNormalizationLayerDataset = GenericDataset<BatchNormalizationLayerDataObject, Size>;
+
+class RandomBatchNormalizationLayerDataset final : public BatchNormalizationLayerDataset<3>
+{
+public:
+ RandomBatchNormalizationLayerDataset()
+ : GenericDataset
+ {
+ BatchNormalizationLayerDataObject{ TensorShape(15U, 16U, 2U, 12U), TensorShape(2U), 0.1f },
+ BatchNormalizationLayerDataObject{ TensorShape(21U, 11U, 12U, 7U), TensorShape(12U), 0.1f },
+ BatchNormalizationLayerDataObject{ TensorShape(7U, 3U, 6U, 11U), TensorShape(6U), 0.1f },
+ }
+ {
+ }
+
+ ~RandomBatchNormalizationLayerDataset() = default;
+};
+
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_BATCH_NORMALIZATION_LAYER_DATASET_H__
diff --git a/tests/dataset/BorderModeDataset.h b/tests/dataset/BorderModeDataset.h
new file mode 100644
index 0000000000..37c7a5ba10
--- /dev/null
+++ b/tests/dataset/BorderModeDataset.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_BORDER_MODE_DATASET_H__
+#define __ARM_COMPUTE_TEST_BORDER_MODE_DATASET_H__
+
+#include "arm_compute/core/Types.h"
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+#include <array>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Data set containing all possible border modes.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on all border modes.
+ */
+class BorderModes
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = BorderMode;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _modes.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+     * Needs to support operator*() and operator++(), both of which a plain pointer provides.
+ */
+ using iterator = const BorderMode *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _modes.data();
+ }
+
+private:
+ std::array<BorderMode, 3> _modes{ { BorderMode::UNDEFINED, BorderMode::CONSTANT, BorderMode::REPLICATE } };
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_BORDER_MODE_DATASET_H__
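
Because the Boost dataset concept only requires begin() and size(), the class exposes no end() iterator; outside of Boost, iteration is bounded by the sample count. A small sketch:

    #include "dataset/BorderModeDataset.h"

    #include <iostream>

    using namespace arm_compute;
    using namespace arm_compute::test;

    int main()
    {
        BorderModes modes;
        BorderModes::iterator it = modes.begin();

        // Step through all samples; the bound comes from size(), not an end iterator
        for(unsigned int i = 0; i < modes.size(); ++i, ++it)
        {
            const BorderMode mode = *it;
            std::cout << static_cast<int>(mode) << "\n";
        }
        return 0;
    }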
diff --git a/tests/dataset/ConvertPolicyDataset.h b/tests/dataset/ConvertPolicyDataset.h
new file mode 100644
index 0000000000..697dba615b
--- /dev/null
+++ b/tests/dataset/ConvertPolicyDataset.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_CONVERT_POLICY_DATASETS_H__
+#define __ARM_COMPUTE_TEST_CONVERT_POLICY_DATASETS_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <array>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** Data set containing all possible convert/overflow policies.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on different convert policies.
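+ *
+ * The samples can also be walked manually. Note that the data set exposes
+ * begin() and size() but no end(), so iteration is count-based. The sketch
+ * below is illustrative and assumes a non-Boost build, where size() returns
+ * unsigned int:
+ * @code
+ * ConvertPolicies policies;
+ * ConvertPolicies::iterator it = policies.begin();
+ * for(unsigned int i = 0; i < policies.size(); ++i, ++it)
+ * {
+ *     const ConvertPolicy policy = *it;
+ *     // ... run the test for this policy ...
+ * }
+ * @endcode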
+ */
+class ConvertPolicies
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = ConvertPolicy;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _policies.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++(), as a plain pointer does.
+ */
+ using iterator = const ConvertPolicy *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _policies.data();
+ }
+
+private:
+ std::array<ConvertPolicy, 2> _policies{ { ConvertPolicy::WRAP, ConvertPolicy::SATURATE } };
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_CONVERT_POLICY_DATASETS_H__
diff --git a/tests/dataset/ConvolutionLayerDataset.h b/tests/dataset/ConvolutionLayerDataset.h
new file mode 100644
index 0000000000..e66117e0d8
--- /dev/null
+++ b/tests/dataset/ConvolutionLayerDataset.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_CONVOLUTION_LAYER_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_CONVOLUTION_LAYER_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "dataset/GenericDataset.h"
+#include "dataset/ShapeDatasets.h"
+
+#include <sstream>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** Convolution layer data object. */
+class ConvolutionLayerDataObject
+{
+public:
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "ConvolutionLayer";
+ ss << "_I" << src_shape;
+ ss << "_K" << weights_shape;
+ ss << "_PS" << info;
+ return ss.str();
+ }
+
+ friend std::ostream &operator<<(std::ostream &os, const ConvolutionLayerDataObject &obj)
+ {
+ os << static_cast<std::string>(obj);
+ return os;
+ }
+
+public:
+ TensorShape src_shape;
+ TensorShape weights_shape;
+ TensorShape bias_shape;
+ TensorShape dst_shape;
+ PadStrideInfo info;
+};
+
+template <unsigned int Size>
+using ConvolutionLayerDataset = GenericDataset<ConvolutionLayerDataObject, Size>;
+
+/** Data set containing small convolution layer shapes. */
+class SmallConvolutionLayerDataset final : public ConvolutionLayerDataset<3>
+{
+public:
+ SmallConvolutionLayerDataset()
+ : GenericDataset
+ {
+ ConvolutionLayerDataObject{ TensorShape(23U, 27U, 5U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(11U, 25U, 21U), PadStrideInfo(2, 1, 0, 0) },
+ ConvolutionLayerDataObject{ TensorShape(33U, 27U, 7U), TensorShape(5U, 5U, 7U, 11U), TensorShape(11U), TensorShape(11U, 12U, 11U), PadStrideInfo(3, 2, 1, 0) },
+ ConvolutionLayerDataObject{ TensorShape(17U, 31U, 2U, 7U), TensorShape(5U, 5U, 2U, 5U), TensorShape(5U), TensorShape(15U, 15U, 5U, 7U), PadStrideInfo(1, 2, 1, 1) }
+ }
+ {
+ }
+
+ ~SmallConvolutionLayerDataset() = default;
+};
+
+/** Data set containing direct convolution tensor shapes. */
+class DirectConvolutionShapes final : public ShapeDataset<3>
+{
+public:
+ DirectConvolutionShapes()
+ : ShapeDataset(TensorShape(3U, 3U, 3U, 2U, 4U, 5U),
+ TensorShape(32U, 37U, 3U),
+ TensorShape(13U, 15U, 8U, 3U))
+ {
+ }
+};
+
+/** AlexNet's convolution layers tensor shapes. */
+class AlexNetConvolutionLayerDataset final : public ConvolutionLayerDataset<5>
+{
+public:
+ AlexNetConvolutionLayerDataset()
+ : GenericDataset
+ {
+ ConvolutionLayerDataObject{ TensorShape(227U, 227U, 3U), TensorShape(11U, 11U, 3U, 96U), TensorShape(96U), TensorShape(55U, 55U, 96U), PadStrideInfo(4, 4, 0, 0) },
+ ConvolutionLayerDataObject{ TensorShape(27U, 27U, 96U), TensorShape(5U, 5U, 96U, 256U), TensorShape(256U), TensorShape(27U, 27U, 256U), PadStrideInfo(1, 1, 2, 2) },
+ ConvolutionLayerDataObject{ TensorShape(13U, 13U, 256U), TensorShape(3U, 3U, 256U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 1, 1) },
+ ConvolutionLayerDataObject{ TensorShape(13U, 13U, 384U), TensorShape(3U, 3U, 384U, 384U), TensorShape(384U), TensorShape(13U, 13U, 384U), PadStrideInfo(1, 1, 1, 1) },
+ ConvolutionLayerDataObject{ TensorShape(13U, 13U, 384U), TensorShape(3U, 3U, 384U, 256U), TensorShape(256U), TensorShape(13U, 13U, 256U), PadStrideInfo(1, 1, 1, 1) }
+ }
+ {
+ }
+
+ ~AlexNetConvolutionLayerDataset() = default;
+};
+
+/** LeNet5's convolution layers tensor shapes. */
+class LeNet5ConvolutionLayerDataset final : public ConvolutionLayerDataset<2>
+{
+public:
+ LeNet5ConvolutionLayerDataset()
+ : GenericDataset
+ {
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 1U), TensorShape(5U, 5U, 1U, 20U), TensorShape(20U), TensorShape(24U, 24U, 20U), PadStrideInfo(1, 1, 0, 0) },
+ ConvolutionLayerDataObject{ TensorShape(12U, 12U, 20U), TensorShape(5U, 5U, 20U, 50U), TensorShape(50U), TensorShape(8U, 8U, 50U), PadStrideInfo(1, 1, 0, 0) },
+ }
+ {
+ }
+
+ ~LeNet5ConvolutionLayerDataset() = default;
+};
+
+/** GoogLeNet v1 convolution layers tensor shapes (Part 1).
+ *
+ * @note Dataset is split into two to avoid a register allocation failure produced by clang in Android debug builds.
+ */
+class GoogLeNetConvolutionLayerDataset1 final : public ConvolutionLayerDataset<32>
+{
+public:
+ GoogLeNetConvolutionLayerDataset1()
+ : GenericDataset
+ {
+ // conv1/7x7_s2
+ ConvolutionLayerDataObject{ TensorShape(224U, 224U, 3U), TensorShape(7U, 7U, 3U, 64U), TensorShape(64U), TensorShape(112U, 112U, 64U), PadStrideInfo(2, 2, 3, 3) },
+ // conv2/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(56U, 56U, 64U), TensorShape(1U, 1U, 64U, 64U), TensorShape(64U), TensorShape(56U, 56U, 64U), PadStrideInfo(1, 1, 0, 0) },
+ // conv2/3x3
+ ConvolutionLayerDataObject{ TensorShape(56U, 56U, 64U), TensorShape(3U, 3U, 64U, 192U), TensorShape(192U), TensorShape(56U, 56U, 192U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_3a/1x1
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 192U), TensorShape(1U, 1U, 192U, 64U), TensorShape(64U), TensorShape(28U, 28U, 64U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_3a/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 192U), TensorShape(1U, 1U, 192U, 96U), TensorShape(96U), TensorShape(28U, 28U, 96U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_3a/3x3
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 96U), TensorShape(3U, 3U, 96U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_3a/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 192U), TensorShape(1U, 1U, 192U, 16U), TensorShape(16U), TensorShape(28U, 28U, 16U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_3a/5x5
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 16U), TensorShape(5U, 5U, 16U, 32U), TensorShape(32U), TensorShape(28U, 28U, 32U), PadStrideInfo(1, 1, 2, 2) },
+ // inception_3a/pool_proj
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 192U), TensorShape(1U, 1U, 192U, 32U), TensorShape(32U), TensorShape(28U, 28U, 32U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_3b/1x1, inception_3b/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 256U), TensorShape(1U, 1U, 256U, 128U), TensorShape(128U), TensorShape(28U, 28U, 128U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_3b/3x3
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 128U), TensorShape(3U, 3U, 128U, 192U), TensorShape(192U), TensorShape(28U, 28U, 192U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_3b/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 256U), TensorShape(1U, 1U, 256U, 32U), TensorShape(32U), TensorShape(28U, 28U, 32U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_3b/5x5
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 32U), TensorShape(5U, 5U, 32U, 96U), TensorShape(96U), TensorShape(28U, 28U, 96U), PadStrideInfo(1, 1, 2, 2) },
+ // inception_3b/pool_proj
+ ConvolutionLayerDataObject{ TensorShape(28U, 28U, 256U), TensorShape(1U, 1U, 256U, 64U), TensorShape(64U), TensorShape(28U, 28U, 64U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4a/1x1
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 480U), TensorShape(1U, 1U, 480U, 192U), TensorShape(192U), TensorShape(14U, 14U, 192U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4a/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 480U), TensorShape(1U, 1U, 480U, 96U), TensorShape(96U), TensorShape(14U, 14U, 96U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4a/3x3
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 96U), TensorShape(3U, 3U, 96U, 208U), TensorShape(208U), TensorShape(14U, 14U, 208U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_4a/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 480U), TensorShape(1U, 1U, 480U, 16U), TensorShape(16U), TensorShape(14U, 14U, 16U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4a/5x5
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 16U), TensorShape(5U, 5U, 16U, 48U), TensorShape(48U), TensorShape(14U, 14U, 48U), PadStrideInfo(1, 1, 2, 2) },
+ // inception_4a/pool_proj
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 480U), TensorShape(1U, 1U, 480U, 64U), TensorShape(64U), TensorShape(14U, 14U, 64U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4b/1x1
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(1U, 1U, 512U, 160U), TensorShape(160U), TensorShape(14U, 14U, 160U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4b/3x3_reduce, inception_4d/1x1
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(1U, 1U, 512U, 112U), TensorShape(112U), TensorShape(14U, 14U, 112U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4b/3x3
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 112U), TensorShape(3U, 3U, 112U, 224U), TensorShape(224U), TensorShape(14U, 14U, 224U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_4b/5x5_reduce, inception_4c/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(1U, 1U, 512U, 24U), TensorShape(24U), TensorShape(14U, 14U, 24U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4b/5x5, inception_4c/5x5
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 24U), TensorShape(5U, 5U, 24U, 64U), TensorShape(64U), TensorShape(14U, 14U, 64U), PadStrideInfo(1, 1, 2, 2) },
+ // inception_4b/pool_proj, inception_4c/pool_proj, inception_4d/pool_proj
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(1U, 1U, 512U, 64U), TensorShape(64U), TensorShape(14U, 14U, 64U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4c/1x1, inception_4c/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(1U, 1U, 512U, 128U), TensorShape(128U), TensorShape(14U, 14U, 128U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4c/3x3
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 128U), TensorShape(3U, 3U, 128U, 256U), TensorShape(256U), TensorShape(14U, 14U, 256U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_4d/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(1U, 1U, 512U, 144U), TensorShape(144U), TensorShape(14U, 14U, 144U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4d/3x3
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 144U), TensorShape(3U, 3U, 144U, 288U), TensorShape(288U), TensorShape(14U, 14U, 288U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_4d/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(1U, 1U, 512U, 32U), TensorShape(32U), TensorShape(14U, 14U, 32U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4d/5x5
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 32U), TensorShape(5U, 5U, 32U, 64U), TensorShape(64U), TensorShape(14U, 14U, 64U), PadStrideInfo(1, 1, 2, 2) },
+ }
+ {
+ }
+
+ ~GoogLeNetConvolutionLayerDataset1() = default;
+};
+
+/** GoogLeNet v1 convolution layers tensor shapes (Part 2). */
+class GoogLeNetConvolutionLayerDataset2 final : public ConvolutionLayerDataset<17>
+{
+public:
+ GoogLeNetConvolutionLayerDataset2()
+ : GenericDataset
+ {
+ // inception_4e/1x1
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 528U), TensorShape(1U, 1U, 528U, 256U), TensorShape(256U), TensorShape(14U, 14U, 256U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4e/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 528U), TensorShape(1U, 1U, 528U, 160U), TensorShape(160U), TensorShape(14U, 14U, 160U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4e/3x3
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 160U), TensorShape(3U, 3U, 160U, 320U), TensorShape(320U), TensorShape(14U, 14U, 320U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_4e/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 528U), TensorShape(1U, 1U, 528U, 32U), TensorShape(32U), TensorShape(14U, 14U, 32U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_4e/5x5
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 32U), TensorShape(5U, 5U, 32U, 128U), TensorShape(128U), TensorShape(14U, 14U, 128U), PadStrideInfo(1, 1, 2, 2) },
+ // inception_4e/pool_proj
+ ConvolutionLayerDataObject{ TensorShape(14U, 14U, 528U), TensorShape(1U, 1U, 528U, 128U), TensorShape(128U), TensorShape(14U, 14U, 128U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5a/1x1
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(1U, 1U, 832U, 256U), TensorShape(256U), TensorShape(7U, 7U, 256U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5a/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(1U, 1U, 832U, 160U), TensorShape(160U), TensorShape(7U, 7U, 160U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5a/3x3
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 160U), TensorShape(3U, 3U, 160U, 320U), TensorShape(320U), TensorShape(7U, 7U, 320U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_5a/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(1U, 1U, 832U, 32U), TensorShape(32U), TensorShape(7U, 7U, 32U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5a/5x5
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 32U), TensorShape(5U, 5U, 32U, 128U), TensorShape(128U), TensorShape(7U, 7U, 128U), PadStrideInfo(1, 1, 2, 2) },
+ // inception_5a/pool_proj, inception_5b/pool_proj
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(1U, 1U, 832U, 128U), TensorShape(128U), TensorShape(7U, 7U, 128U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5b/1x1
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(1U, 1U, 832U, 384U), TensorShape(384U), TensorShape(7U, 7U, 384U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5b/3x3_reduce
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(1U, 1U, 832U, 192U), TensorShape(192U), TensorShape(7U, 7U, 192U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5b/3x3
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 192U), TensorShape(3U, 3U, 192U, 384U), TensorShape(384U), TensorShape(7U, 7U, 384U), PadStrideInfo(1, 1, 1, 1) },
+ // inception_5b/5x5_reduce
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(1U, 1U, 832U, 48U), TensorShape(48U), TensorShape(7U, 7U, 48U), PadStrideInfo(1, 1, 0, 0) },
+ // inception_5b/5x5
+ ConvolutionLayerDataObject{ TensorShape(7U, 7U, 48U), TensorShape(5U, 5U, 48U, 128U), TensorShape(128U), TensorShape(7U, 7U, 128U), PadStrideInfo(1, 1, 2, 2) }
+ }
+ {
+ }
+
+ ~GoogLeNetConvolutionLayerDataset2() = default;
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_CONVOLUTION_LAYER_DATASET_H__
diff --git a/tests/dataset/DataTypeDatasets.h b/tests/dataset/DataTypeDatasets.h
new file mode 100644
index 0000000000..8c63857477
--- /dev/null
+++ b/tests/dataset/DataTypeDatasets.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATA_TYPE_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATA_TYPE_DATASET_H__
+
+#include "arm_compute/core/Types.h"
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+#include <array>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Abstract data set containing data types.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on different data types.
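+ *
+ * Concrete subsets are created by deriving from this class, following the
+ * pattern of the data sets below. An illustrative sketch for a hypothetical
+ * subset containing only the two 8-bit types:
+ * @code
+ * class EightBitDataTypes final : public DataTypes<2>
+ * {
+ * public:
+ *     EightBitDataTypes()
+ *         : DataTypes{ DataType::U8, DataType::S8 }
+ *     {
+ *     }
+ * };
+ * @endcode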
+ */
+template <unsigned int Size>
+class DataTypes
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = DataType;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _types.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++(), as a plain pointer does.
+ */
+ using iterator = const DataType *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _types.data();
+ }
+
+protected:
+ /** Protected constructor to make the class abstract. */
+ template <typename... Ts>
+ DataTypes(Ts &&... types)
+ : _types{ { types... } }
+ {
+ }
+
+ /** Protected destructor to prevent deletion of derived classes through a
+ * pointer to the base class.
+ */
+ ~DataTypes() = default;
+
+private:
+ std::array<DataType, Size> _types;
+};
+
+/** Data set containing all data types. */
+class AllDataTypes final : public DataTypes<14>
+{
+public:
+ AllDataTypes()
+ : DataTypes{ DataType::U8, DataType::S8, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32, DataType::U64, DataType::S64,
+ DataType::F16, DataType::F32, DataType::F64, DataType::SIZET,
+ DataType::QS8, DataType::QS16 }
+ {
+ }
+
+ ~AllDataTypes() = default;
+};
+
+/** Data set containing all unsigned data types. */
+class UnsignedDataTypes final : public DataTypes<4>
+{
+public:
+ UnsignedDataTypes()
+ : DataTypes{ DataType::U8, DataType::U16, DataType::U32, DataType::U64 }
+ {
+ }
+
+ ~UnsignedDataTypes() = default;
+};
+
+/** Data set containing all signed data types. */
+class SignedDataTypes final : public DataTypes<4>
+{
+public:
+ SignedDataTypes()
+ : DataTypes{ DataType::S8, DataType::S16, DataType::S32, DataType::S64 }
+ {
+ }
+
+ ~SignedDataTypes() = default;
+};
+
+/** Data set containing all floating point data types. */
+class FloatDataTypes final : public DataTypes<3>
+{
+public:
+ FloatDataTypes()
+ : DataTypes{ DataType::F16, DataType::F32, DataType::F64 }
+ {
+ }
+
+ ~FloatDataTypes() = default;
+};
+
+/** Data set containing all fixed point data types. */
+class FixedPointDataTypes final : public DataTypes<2>
+{
+public:
+ FixedPointDataTypes()
+ : DataTypes{ DataType::QS8, DataType::QS16 }
+ {
+ }
+
+ ~FixedPointDataTypes() = default;
+};
+
+/** Supported CNN float types. */
+class CNNFloatDataTypes final : public DataTypes<1>
+{
+public:
+ CNNFloatDataTypes()
+ : DataTypes{ DataType::F32 }
+ {
+ }
+
+ ~CNNFloatDataTypes() = default;
+};
+
+/** Supported CNN fixed point types. */
+class CNNFixedPointDataTypes final : public DataTypes<1>
+{
+public:
+ CNNFixedPointDataTypes()
+ : DataTypes{ DataType::QS8 }
+ {
+ }
+
+ ~CNNFixedPointDataTypes() = default;
+};
+
+/** Supported CNN types. */
+class CNNDataTypes final : public DataTypes<2>
+{
+public:
+ CNNDataTypes()
+ : DataTypes{ DataType::F32, DataType::QS8 }
+ {
+ }
+
+ ~CNNDataTypes() = default;
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATA_TYPE_DATASET_H__
diff --git a/tests/dataset/FullyConnectedLayerDataset.h b/tests/dataset/FullyConnectedLayerDataset.h
new file mode 100644
index 0000000000..53b7d022d7
--- /dev/null
+++ b/tests/dataset/FullyConnectedLayerDataset.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_FULLY_CONNECTED_LAYER_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_FULLY_CONNECTED_LAYER_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "dataset/GenericDataset.h"
+
+#include <sstream>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
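+/** Fully connected layer data object. */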
+class FullyConnectedLayerDataObject
+{
+public:
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "FullyConnectedLayer";
+ ss << "_I" << src_shape;
+ ss << "_K" << weights_shape;
+ return ss.str();
+ }
+
+ friend std::ostream &operator<<(std::ostream &os, const FullyConnectedLayerDataObject &obj)
+ {
+ os << static_cast<std::string>(obj);
+ return os;
+ }
+
+public:
+ TensorShape src_shape;
+ TensorShape weights_shape;
+ TensorShape bias_shape;
+ TensorShape dst_shape;
+ bool transpose_weights;
+ bool are_weights_reshaped;
+};
+
+template <unsigned int Size>
+using FullyConnectedLayerDataset = GenericDataset<FullyConnectedLayerDataObject, Size>;
+
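+/** Data set containing small fully connected layer shapes. */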
+class SmallFullyConnectedLayerDataset final : public FullyConnectedLayerDataset<5>
+{
+public:
+ SmallFullyConnectedLayerDataset()
+ : GenericDataset
+ {
+ FullyConnectedLayerDataObject{ TensorShape(9U, 5U, 7U), TensorShape(315U, 271U), TensorShape(271U), TensorShape(271U), true, false },
+ FullyConnectedLayerDataObject{ TensorShape(9U, 5U, 7U, 3U), TensorShape(315U, 271U), TensorShape(271U), TensorShape(271U, 3U), true, false },
+ FullyConnectedLayerDataObject{ TensorShape(201U), TensorShape(201U, 529U), TensorShape(529U), TensorShape(529U), true, false },
+ FullyConnectedLayerDataObject{ TensorShape(9U, 5U, 7U), TensorShape(315U, 271U), TensorShape(271U), TensorShape(271U), true, true },
+ FullyConnectedLayerDataObject{ TensorShape(201U), TensorShape(201U, 529U), TensorShape(529U), TensorShape(529U), true, true },
+ }
+ {
+ }
+
+ ~SmallFullyConnectedLayerDataset() = default;
+};
+
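+/** Data set containing large fully connected layer shapes. */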
+class LargeFullyConnectedLayerDataset final : public FullyConnectedLayerDataset<5>
+{
+public:
+ LargeFullyConnectedLayerDataset()
+ : GenericDataset
+ {
+ FullyConnectedLayerDataObject{ TensorShape(9U, 5U, 257U), TensorShape(11565U, 2123U), TensorShape(2123U), TensorShape(2123U), true, false },
+ FullyConnectedLayerDataObject{ TensorShape(9U, 5U, 257U, 2U), TensorShape(11565U, 2123U), TensorShape(2123U), TensorShape(2123U, 2U), true, false },
+ FullyConnectedLayerDataObject{ TensorShape(3127U), TensorShape(3127U, 989U), TensorShape(989U), TensorShape(989U), true, false },
+ FullyConnectedLayerDataObject{ TensorShape(9U, 5U, 257U), TensorShape(11565U, 2123U), TensorShape(2123U), TensorShape(2123U), true, true },
+ FullyConnectedLayerDataObject{ TensorShape(3127U), TensorShape(3127U, 989U), TensorShape(989U), TensorShape(989U), true, true },
+ }
+ {
+ }
+
+ ~LargeFullyConnectedLayerDataset() = default;
+};
+
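+/** AlexNet's fully connected layers tensor shapes. */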
+class AlexNetFullyConnectedLayerDataset final : public FullyConnectedLayerDataset<3>
+{
+public:
+ AlexNetFullyConnectedLayerDataset()
+ : GenericDataset
+ {
+ FullyConnectedLayerDataObject{ TensorShape(6U, 6U, 256U), TensorShape(9216U, 4096U), TensorShape(4096U), TensorShape(4096U), true },
+ FullyConnectedLayerDataObject{ TensorShape(4096U), TensorShape(4096U, 4096U), TensorShape(4096U), TensorShape(4096U), true },
+ FullyConnectedLayerDataObject{ TensorShape(4096U), TensorShape(4096U, 1000U), TensorShape(1000U), TensorShape(1000U), true },
+ }
+ {
+ }
+
+ ~AlexNetFullyConnectedLayerDataset() = default;
+};
+
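+/** LeNet5's fully connected layers tensor shapes. */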
+class LeNet5FullyConnectedLayerDataset final : public FullyConnectedLayerDataset<2>
+{
+public:
+ LeNet5FullyConnectedLayerDataset()
+ : GenericDataset
+ {
+ FullyConnectedLayerDataObject{ TensorShape(4U, 4U, 50U), TensorShape(800U, 500U), TensorShape(500U), TensorShape(500U) },
+ FullyConnectedLayerDataObject{ TensorShape(500U), TensorShape(500U, 10U), TensorShape(10U), TensorShape(10U) },
+ }
+ {
+ }
+
+ ~LeNet5FullyConnectedLayerDataset() = default;
+};
+
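+/** GoogLeNet v1's fully connected layer tensor shapes. */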
+class GoogLeNetFullyConnectedLayerDataset final : public FullyConnectedLayerDataset<1>
+{
+public:
+ GoogLeNetFullyConnectedLayerDataset()
+ : GenericDataset
+ {
+ FullyConnectedLayerDataObject{ TensorShape(1024U), TensorShape(1024U, 1000U), TensorShape(1000U), TensorShape(1000U), true },
+ }
+ {
+ }
+
+ ~GoogLeNetFullyConnectedLayerDataset() = default;
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_FULLY_CONNECTED_LAYER_DATASET_H__
diff --git a/tests/dataset/GEMMDataset.h b/tests/dataset/GEMMDataset.h
new file mode 100644
index 0000000000..f45bc3e838
--- /dev/null
+++ b/tests/dataset/GEMMDataset.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_GEMM_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_GEMM_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "dataset/GenericDataset.h"
+
+#include <ostream>
+#include <sstream>
+
+#include <tuple>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** GEMM data object.
+ *
+ * Describes a matrix multiplication of the form D = alpha * A * B + beta * C.
+ */
+class GEMMDataObject
+{
+public:
+ TensorShape shape_a;
+ TensorShape shape_b;
+ TensorShape shape_c;
+ TensorShape shape_d;
+ float alpha;
+ float beta;
+
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "GEMM";
+ ss << "_A" << shape_a;
+ ss << "_B" << shape_b;
+ ss << "_C" << shape_c;
+ ss << "_D" << shape_d;
+ ss << "_alpha" << alpha;
+ ss << "_beta" << beta;
+ return ss.str();
+ }
+
+ friend std::ostream &operator<<(std::ostream &os, const GEMMDataObject &obj)
+ {
+ os << static_cast<std::string>(obj);
+ return os;
+ }
+};
+
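+/** Data set containing small GEMM shapes. */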
+class SmallGEMMDataset final : public GenericDataset<GEMMDataObject, 4>
+{
+public:
+ SmallGEMMDataset()
+ : GenericDataset
+ {
+ GEMMDataObject{ TensorShape(21u, 13u), TensorShape(33u, 21u), TensorShape(33u, 13u), TensorShape(33u, 13u), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(31u, 1u), TensorShape(23u, 31u), TensorShape(23u, 1u), TensorShape(23u, 1u), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(38u, 12u), TensorShape(21u, 38u), TensorShape(21u, 12u), TensorShape(21u, 12u), 0.2f, 1.2f },
+ GEMMDataObject{ TensorShape(32u, 1u), TensorShape(17u, 32u), TensorShape(17u, 1u), TensorShape(17u, 1u), 0.4f, 0.7f },
+ }
+ {
+ }
+
+ ~SmallGEMMDataset() = default;
+};
+
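+/** Data set containing large GEMM shapes. */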
+class LargeGEMMDataset final : public GenericDataset<GEMMDataObject, 4>
+{
+public:
+ LargeGEMMDataset()
+ : GenericDataset
+ {
+ GEMMDataObject{ TensorShape(923u, 429u), TensorShape(871u, 923u), TensorShape(871u, 429u), TensorShape(871u, 429u), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1021u, 1u), TensorShape(783u, 1021u), TensorShape(783u, 1u), TensorShape(783u, 1u), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(681u, 1023u), TensorShape(213u, 681u), TensorShape(213u, 1023u), TensorShape(213u, 1023u), 0.2f, 1.2f },
+ GEMMDataObject{ TensorShape(941u, 1u), TensorShape(623u, 941u), TensorShape(623u, 1u), TensorShape(623u, 1u), 0.4f, 0.7f },
+ }
+ {
+ }
+
+ ~LargeGEMMDataset() = default;
+};
+
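+/** GoogLeNet v1's GEMM tensor shapes (Part 1). */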
+class GoogLeNetGEMMDataset1 final : public GenericDataset<GEMMDataObject, 32>
+{
+public:
+ GoogLeNetGEMMDataset1()
+ : GenericDataset
+ {
+ GEMMDataObject{ TensorShape(147U, 12544U), TensorShape(64U, 147U), TensorShape(64U, 12544U), TensorShape(64U, 12544U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(64U, 3136U), TensorShape(64U, 64U), TensorShape(64U, 3136U), TensorShape(64U, 3136U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(576U, 3136U), TensorShape(192U, 576U), TensorShape(192U, 3136U), TensorShape(192U, 3136U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(192U, 784U), TensorShape(64U, 192U), TensorShape(64U, 784U), TensorShape(64U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(192U, 784U), TensorShape(96U, 192U), TensorShape(96U, 784U), TensorShape(96U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(864U, 784U), TensorShape(128U, 864U), TensorShape(128U, 784U), TensorShape(128U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(192U, 784U), TensorShape(16U, 192U), TensorShape(16U, 784U), TensorShape(16U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(400U, 784U), TensorShape(32U, 400U), TensorShape(32U, 784U), TensorShape(32U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(192U, 784U), TensorShape(32U, 192U), TensorShape(32U, 784U), TensorShape(32U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(256U, 784U), TensorShape(128U, 256U), TensorShape(128U, 784U), TensorShape(128U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(256U, 784U), TensorShape(128U, 256U), TensorShape(128U, 784U), TensorShape(128U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1152U, 784U), TensorShape(192U, 1152U), TensorShape(192U, 784U), TensorShape(192U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(256U, 784U), TensorShape(32U, 256U), TensorShape(32U, 784U), TensorShape(32U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(800U, 784U), TensorShape(96U, 800U), TensorShape(96U, 784U), TensorShape(96U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(256U, 784U), TensorShape(64U, 256U), TensorShape(64U, 784U), TensorShape(64U, 784U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(480U, 196U), TensorShape(192U, 480U), TensorShape(192U, 196U), TensorShape(192U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(480U, 196U), TensorShape(96U, 480U), TensorShape(96U, 196U), TensorShape(96U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(864U, 196U), TensorShape(204U, 864U), TensorShape(204U, 196U), TensorShape(204U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(480U, 196U), TensorShape(16U, 480U), TensorShape(16U, 196U), TensorShape(16U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(400U, 196U), TensorShape(48U, 400U), TensorShape(48U, 196U), TensorShape(48U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(480U, 196U), TensorShape(64U, 480U), TensorShape(64U, 196U), TensorShape(64U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(508U, 196U), TensorShape(160U, 508U), TensorShape(160U, 196U), TensorShape(160U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(508U, 196U), TensorShape(112U, 508U), TensorShape(112U, 196U), TensorShape(112U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1008U, 196U), TensorShape(224U, 1008U), TensorShape(224U, 196U), TensorShape(224U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(508U, 196U), TensorShape(24U, 508U), TensorShape(24U, 196U), TensorShape(24U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(600U, 196U), TensorShape(64U, 600U), TensorShape(64U, 196U), TensorShape(64U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(508U, 196U), TensorShape(64U, 508U), TensorShape(64U, 196U), TensorShape(64U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(128U, 512U), TensorShape(128U, 196U), TensorShape(128U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(128U, 512U), TensorShape(128U, 196U), TensorShape(128U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1152U, 196U), TensorShape(256U, 1152U), TensorShape(256U, 196U), TensorShape(256U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(24U, 512U), TensorShape(24U, 196U), TensorShape(24U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(600U, 196U), TensorShape(64U, 600U), TensorShape(64U, 196U), TensorShape(64U, 196U), 1.0f, 0.0f }
+ }
+ {
+ }
+
+ ~GoogLeNetGEMMDataset1() = default;
+};
+
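+/** GoogLeNet v1's GEMM tensor shapes (Part 2). */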
+class GoogLeNetGEMMDataset2 final : public GenericDataset<GEMMDataObject, 32>
+{
+public:
+ GoogLeNetGEMMDataset2()
+ : GenericDataset
+ {
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(64U, 512U), TensorShape(64U, 196U), TensorShape(64U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(112U, 512U), TensorShape(112U, 196U), TensorShape(112U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(144U, 512U), TensorShape(144U, 196U), TensorShape(144U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1296U, 196U), TensorShape(288U, 1296U), TensorShape(288U, 196U), TensorShape(288U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(32U, 512U), TensorShape(32U, 196U), TensorShape(32U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(800U, 196U), TensorShape(64U, 800U), TensorShape(64U, 196U), TensorShape(64U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(512U, 196U), TensorShape(64U, 512U), TensorShape(64U, 196U), TensorShape(64U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(528U, 196U), TensorShape(256U, 528U), TensorShape(256U, 196U), TensorShape(256U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(528U, 196U), TensorShape(160U, 528U), TensorShape(160U, 196U), TensorShape(160U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1440U, 196U), TensorShape(320U, 1440U), TensorShape(320U, 196U), TensorShape(320U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(528U, 196U), TensorShape(32U, 528U), TensorShape(32U, 196U), TensorShape(32U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(800U, 196U), TensorShape(128U, 800U), TensorShape(128U, 196U), TensorShape(128U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(528U, 196U), TensorShape(128U, 528U), TensorShape(128U, 196U), TensorShape(128U, 196U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(256U, 832U), TensorShape(256U, 49U), TensorShape(256U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(160U, 832U), TensorShape(160U, 49U), TensorShape(160U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1440U, 49U), TensorShape(320U, 1440U), TensorShape(320U, 49U), TensorShape(320U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(48U, 832U), TensorShape(48U, 49U), TensorShape(48U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1200U, 49U), TensorShape(128U, 1200U), TensorShape(128U, 49U), TensorShape(128U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(128U, 832U), TensorShape(128U, 49U), TensorShape(128U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(384U, 832U), TensorShape(384U, 49U), TensorShape(384U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(192U, 832U), TensorShape(192U, 49U), TensorShape(192U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1728U, 49U), TensorShape(384U, 1728U), TensorShape(384U, 49U), TensorShape(384U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(48U, 832U), TensorShape(48U, 49U), TensorShape(48U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1200U, 49U), TensorShape(128U, 1200U), TensorShape(128U, 49U), TensorShape(128U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(832U, 49U), TensorShape(128U, 832U), TensorShape(128U, 49U), TensorShape(128U, 49U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(508U, 16U), TensorShape(128U, 508U), TensorShape(128U, 16U), TensorShape(128U, 16U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(2048U, 1U), TensorShape(1024U, 2048U), TensorShape(1024U, 1U), TensorShape(1024U, 1U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1024U, 1U), TensorShape(1008U, 1024U), TensorShape(1008U, 1U), TensorShape(1008U, 1U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(528U, 16U), TensorShape(128U, 528U), TensorShape(128U, 16U), TensorShape(128U, 16U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(2048U, 1U), TensorShape(1024U, 2048U), TensorShape(1024U, 1U), TensorShape(1024U, 1U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1024U, 1U), TensorShape(1008U, 1024U), TensorShape(1008U, 1U), TensorShape(1008U, 1U), 1.0f, 0.0f },
+ GEMMDataObject{ TensorShape(1024U, 1U), TensorShape(1008U, 1024U), TensorShape(1008U, 1U), TensorShape(1008U, 1U), 1.0f, 0.0f }
+ }
+ {
+ }
+
+ ~GoogLeNetGEMMDataset2() = default;
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_GEMM_DATASET_H__
diff --git a/tests/dataset/GenericDataset.h b/tests/dataset/GenericDataset.h
new file mode 100644
index 0000000000..4ca489bd82
--- /dev/null
+++ b/tests/dataset/GenericDataset.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_GENERIC_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_GENERIC_DATASET_H__
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+#include <array>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** Abstract data set containing multiple objects T.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on different configurations.
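+ *
+ * Concrete data sets derive from this class and forward their objects to
+ * the constructor. An illustrative sketch, where MyObject and its two
+ * initialisers are placeholders:
+ * @code
+ * class MyDataset final : public GenericDataset<MyObject, 2>
+ * {
+ * public:
+ *     MyDataset()
+ *         : GenericDataset
+ *     {
+ *         MyObject{ first_config },
+ *         MyObject{ second_config }
+ *     }
+ *     {
+ *     }
+ * };
+ * @endcode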
+ */
+template <class T, unsigned int Size>
+class GenericDataset
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = T;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _data.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++(), as a plain pointer does.
+ */
+ using iterator = const T *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _data.data();
+ }
+
+protected:
+ /** Protected constructor to make the class abstract. */
+ template <typename... Ts>
+ GenericDataset(Ts... objs)
+ : _data{ { objs... } }
+ {
+ }
+
+ /** Protected destructor to prevent deletion of derived class through a
+ * pointer to the base class.
+ */
+ ~GenericDataset() = default;
+
+private:
+ std::array<T, Size> _data;
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_GENERIC_DATASET_H__
diff --git a/tests/dataset/ImageDatasets.h b/tests/dataset/ImageDatasets.h
new file mode 100644
index 0000000000..555227e83b
--- /dev/null
+++ b/tests/dataset/ImageDatasets.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_IMAGE_DATASETS_H__
+#define __ARM_COMPUTE_TEST_IMAGE_DATASETS_H__
+
+#include <array>
+#include <string>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** Abstract data set containing image names.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on different images.
+ */
+template <unsigned int Size>
+class ImageDataset
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = const std::string;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _images.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++(), as a plain pointer does.
+ */
+ using iterator = const std::string *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _images.data();
+ }
+
+protected:
+ /** Protected constructor to make the class abstract. */
+ template <typename... Ts>
+ ImageDataset(Ts... images)
+ : _images{ { images... } }
+ {
+ }
+
+ /** Protected destructor to prevent deletion of derived class through a
+ * pointer to the base class.
+ */
+ ~ImageDataset() = default;
+
+private:
+ std::array<std::string, Size> _images;
+};
+
+/** Data set containing names of small images. */
+class SmallImages final : public ImageDataset<2>
+{
+public:
+ SmallImages()
+ : ImageDataset("128x128.ppm", "640x480.ppm")
+ {
+ }
+};
+
+/** Data set containing names of large images. */
+class LargeImages final : public ImageDataset<3>
+{
+public:
+ LargeImages()
+#ifdef INTERNAL_ONLY
+ : ImageDataset("1280x720.ppm", "1920x1080.ppm", "4160x3120.ppm")
+ // The 4k image is too large to distribute
+#else
+ : ImageDataset("1280x720.ppm", "1920x1080.ppm")
+#endif /* INTERNAL_ONLY */
+ {
+ }
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_IMAGE_DATASETS_H__
diff --git a/tests/dataset/InterpolationPolicyDataset.h b/tests/dataset/InterpolationPolicyDataset.h
new file mode 100644
index 0000000000..e6062eae53
--- /dev/null
+++ b/tests/dataset/InterpolationPolicyDataset.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_INTERPOLATION_POLICY_DATASET_H__
+#define __ARM_COMPUTE_TEST_INTERPOLATION_POLICY_DATASET_H__
+
+#include "arm_compute/core/Types.h"
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+#include <array>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Data set containing all possible interpolation policies.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on all interpolation policies.
+ */
+class InterpolationPolicies
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = InterpolationPolicy;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _policies.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++(), as a plain pointer does.
+ */
+ using iterator = const InterpolationPolicy *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _policies.data();
+ }
+
+private:
+ std::array<InterpolationPolicy, 3> _policies{ { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR, InterpolationPolicy::AREA } };
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_INTERPOLATION_POLICY_DATASET_H__
diff --git a/tests/dataset/NormalizationLayerDataset.h b/tests/dataset/NormalizationLayerDataset.h
new file mode 100644
index 0000000000..7234f41551
--- /dev/null
+++ b/tests/dataset/NormalizationLayerDataset.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_NORMALIZATION_LAYER_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_NORMALIZATION_LAYER_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "dataset/GenericDataset.h"
+
+#include <sstream>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
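+/** Normalization layer data object. */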
+class NormalizationLayerDataObject
+{
+public:
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "NormalizationLayer";
+ ss << "_I" << shape;
+ ss << "_F_" << info.type();
+ ss << "_S_" << info.norm_size();
+ return ss.str();
+ }
+
+public:
+ TensorShape shape;
+ NormalizationLayerInfo info;
+};
+
+template <unsigned int Size>
+using NormalizationLayerDataset = GenericDataset<NormalizationLayerDataObject, Size>;
+
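+/** GoogLeNet v1's normalization layers tensor shapes. */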
+class GoogLeNetNormalizationLayerDataset final : public NormalizationLayerDataset<2>
+{
+public:
+ GoogLeNetNormalizationLayerDataset()
+ : GenericDataset
+ {
+ // conv2/norm2
+ NormalizationLayerDataObject{ TensorShape(56U, 56U, 192U), NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f) },
+ // pool1/norm1
+ NormalizationLayerDataObject{ TensorShape(56U, 56U, 64U), NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f) }
+ }
+ {
+ }
+
+ ~GoogLeNetNormalizationLayerDataset() = default;
+};
+
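+/** AlexNet's normalization layers tensor shapes. */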
+class AlexNetNormalizationLayerDataset final : public NormalizationLayerDataset<2>
+{
+public:
+ AlexNetNormalizationLayerDataset()
+ : GenericDataset
+ {
+ NormalizationLayerDataObject{ TensorShape(55U, 55U, 96U), NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f) },
+ NormalizationLayerDataObject{ TensorShape(27U, 27U, 256U), NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f) },
+ }
+ {
+ }
+
+ ~AlexNetNormalizationLayerDataset() = default;
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_NORMALIZATION_LAYER_DATASET_H__
diff --git a/tests/dataset/NormalizationTypeDataset.h b/tests/dataset/NormalizationTypeDataset.h
new file mode 100644
index 0000000000..bb1975c6c9
--- /dev/null
+++ b/tests/dataset/NormalizationTypeDataset.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_NORMALIZATION_TYPE_DATASET_H__
+#define __ARM_COMPUTE_TEST_NORMALIZATION_TYPE_DATASET_H__
+
+#include "arm_compute/core/Types.h"
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+#include <array>
+
+namespace arm_compute
+{
+namespace test
+{
+/** Data set containing all possible normalization types.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on all normalization types.
+ */
+class NormalizationTypes
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = NormType;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _types.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++(), as a plain pointer does.
+ */
+ using iterator = const NormType *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _types.data();
+ }
+
+private:
+ std::array<NormType, 3> _types{ { NormType::IN_MAP_1D, NormType::IN_MAP_2D, NormType::CROSS_MAP } };
+};
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_NORMALIZATION_TYPE_DATASET_H__
diff --git a/tests/dataset/PoolingLayerDataset.h b/tests/dataset/PoolingLayerDataset.h
new file mode 100644
index 0000000000..0a50a763ce
--- /dev/null
+++ b/tests/dataset/PoolingLayerDataset.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_POOLING_LAYER_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_POOLING_LAYER_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "dataset/GenericDataset.h"
+
+#include <ostream>
+#include <sstream>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+class PoolingLayerDataObject
+{
+public:
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "PoolingLayer";
+ ss << "_I" << src_shape;
+ ss << "_S_" << info.pool_size();
+ ss << "_F_" << info.pool_type();
+ ss << "_PS" << info.pad_stride_info();
+ return ss.str();
+ }
+
+ friend std::ostream &operator<<(std::ostream &s, const PoolingLayerDataObject &obj)
+ {
+ s << static_cast<std::string>(obj);
+ return s;
+ }
+
+public:
+ TensorShape src_shape;
+ TensorShape dst_shape;
+ PoolingLayerInfo info;
+};
+
+template <unsigned int Size>
+using PoolingLayerDataset = GenericDataset<PoolingLayerDataObject, Size>;
+
+class AlexNetPoolingLayerDataset final : public PoolingLayerDataset<3>
+{
+public:
+ AlexNetPoolingLayerDataset()
+ : GenericDataset
+ {
+ PoolingLayerDataObject{ TensorShape(55U, 55U, 96U), TensorShape(27U, 27U, 96U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(27U, 27U, 256U), TensorShape(13U, 13U, 256U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(13U, 13U, 256U), TensorShape(6U, 6U, 256U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)) },
+ }
+ {
+ }
+
+ ~AlexNetPoolingLayerDataset() = default;
+};
+
+class LeNet5PoolingLayerDataset final : public PoolingLayerDataset<2>
+{
+public:
+ LeNet5PoolingLayerDataset()
+ : GenericDataset
+ {
+ PoolingLayerDataObject{ TensorShape(24U, 24U, 20U), TensorShape(12U, 12U, 20U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(8U, 8U, 50U), TensorShape(4U, 4U, 50U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)) },
+ }
+ {
+ }
+
+ ~LeNet5PoolingLayerDataset() = default;
+};
+
+class GoogLeNetPoolingLayerDataset final : public PoolingLayerDataset<10>
+{
+public:
+ GoogLeNetPoolingLayerDataset()
+ : GenericDataset
+ {
+ // FIXME: Add support for 7x7 pooling layer pool5/7x7_s1
+ // pool1/3x3_s2
+ PoolingLayerDataObject{ TensorShape(112U, 112U, 64U), TensorShape(56U, 56U, 64U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)) },
+ // pool2/3x3_s2
+ PoolingLayerDataObject{ TensorShape(56U, 56U, 192U), TensorShape(28U, 28U, 192U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)) },
+ // inception_3a/pool
+ PoolingLayerDataObject{ TensorShape(28U, 28U, 192U), TensorShape(28U, 28U, 192U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)) },
+ // inception_3b/pool
+ PoolingLayerDataObject{ TensorShape(28U, 28U, 256U), TensorShape(28U, 28U, 256U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)) },
+ // pool3/3x3_s2
+ PoolingLayerDataObject{ TensorShape(28U, 28U, 480U), TensorShape(14U, 14U, 480U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)) },
+ // inception_4a/pool
+ PoolingLayerDataObject{ TensorShape(14U, 14U, 480U), TensorShape(14U, 14U, 480U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)) },
+ // inception_4b/pool, inception_4c/pool, inception_4d/pool
+ PoolingLayerDataObject{ TensorShape(14U, 14U, 512U), TensorShape(14U, 14U, 512U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)) },
+ // inception_4e/pool
+ PoolingLayerDataObject{ TensorShape(14U, 14U, 528U), TensorShape(14U, 14U, 528U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)) },
+ // pool4/3x3_s2
+ PoolingLayerDataObject{ TensorShape(14U, 14U, 832U), TensorShape(7U, 7U, 832U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)) },
+ // inception_5a/pool, inception_5b/pool
+ PoolingLayerDataObject{ TensorShape(7U, 7U, 832U), TensorShape(7U, 7U, 832U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL)) },
+ }
+ {
+ }
+
+ ~GoogLeNetPoolingLayerDataset() = default;
+};
+
+class RandomPoolingLayerDataset final : public PoolingLayerDataset<8>
+{
+public:
+ RandomPoolingLayerDataset()
+ : GenericDataset
+ {
+ PoolingLayerDataObject{ TensorShape(27U, 27U, 16U), TensorShape(13U, 13U, 16U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(13U, 13U, 32U), TensorShape(6U, 6U, 32U), PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(24U, 24U, 10U), TensorShape(12U, 12U, 10U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(8U, 8U, 30U), TensorShape(4U, 4U, 30U), PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(27U, 27U, 16U), TensorShape(13U, 13U, 16U), PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(13U, 13U, 32U), TensorShape(6U, 6U, 32U), PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(24U, 24U, 10U), TensorShape(12U, 12U, 10U), PoolingLayerInfo(PoolingType::AVG, 2, PadStrideInfo(2, 2, 0, 0)) },
+ PoolingLayerDataObject{ TensorShape(8U, 8U, 30U), TensorShape(4U, 4U, 30U), PoolingLayerInfo(PoolingType::AVG, 2, PadStrideInfo(2, 2, 0, 0)) },
+ }
+ {
+ }
+
+ ~RandomPoolingLayerDataset() = default;
+};
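+
+// Illustrative sketch (test case name and body are hypothetical): each entry
+// bundles the source/destination shapes with the pooling configuration, so a
+// data test case can cover a whole network's pooling layers in one declaration:
+//
+//   BOOST_DATA_TEST_CASE(RunAlexNetPooling, AlexNetPoolingLayerDataset(), obj)
+//   {
+//       // obj.src_shape, obj.dst_shape and obj.info configure the function
+//   }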
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_POOLING_LAYER_DATASET_H__
diff --git a/tests/dataset/RoundingPolicyDataset.h b/tests/dataset/RoundingPolicyDataset.h
new file mode 100644
index 0000000000..c70872020b
--- /dev/null
+++ b/tests/dataset/RoundingPolicyDataset.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_ROUNDING_POLICY_DATASETS_H__
+#define __ARM_COMPUTE_TEST_ROUNDING_POLICY_DATASETS_H__
+
+#include "arm_compute/core/Types.h"
+
+#include <array>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** Data set containing all possible rounding policies.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on different rounding policies.
+ */
+class RoundingPolicies
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = RoundingPolicy;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _policies.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++() which a pointer does.
+ */
+ using iterator = const RoundingPolicy *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _policies.data();
+ }
+
+private:
+ std::array<RoundingPolicy, 3> _policies{ { RoundingPolicy::TO_ZERO, RoundingPolicy::TO_NEAREST_UP, RoundingPolicy::TO_NEAREST_EVEN } };
+};
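+
+// Besides driving Boost data test cases, the data set can be stepped through
+// by hand via its pointer-based iterator; a small sketch (illustrative only):
+//
+//   RoundingPolicies policies;
+//   auto it = policies.begin();
+//   for(unsigned int i = 0; i < policies.size(); ++i, ++it)
+//   {
+//       const RoundingPolicy policy = *it; // TO_ZERO, TO_NEAREST_UP, TO_NEAREST_EVEN
+//   }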
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_ROUNDING_POLICY_DATASETS_H__
diff --git a/tests/dataset/ShapeDatasets.h b/tests/dataset/ShapeDatasets.h
new file mode 100644
index 0000000000..73bdb8ea0e
--- /dev/null
+++ b/tests/dataset/ShapeDatasets.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_SHAPE_DATASETS_H__
+#define __ARM_COMPUTE_TEST_SHAPE_DATASETS_H__
+
+#include "arm_compute/core/TensorShape.h"
+
+#include <array>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+/** Abstract data set containing tensor shapes.
+ *
+ * Can be used as input for Boost data test cases to automatically run a test
+ * case on different tensor shapes.
+ */
+template <unsigned int Size>
+class ShapeDataset
+{
+public:
+ /** Type of the samples in the data set. */
+ using sample = TensorShape;
+
+ /** Dimensionality of the data set. */
+ enum
+ {
+ arity = 1
+ };
+
+ /** Number of samples in the data set. */
+#ifdef BOOST
+ boost::unit_test::data::size_t size() const
+#else
+ unsigned int size() const
+#endif
+ {
+ return _shapes.size();
+ }
+
+ /** Type of the iterator used to step through all samples in the data set.
+ * Needs to support operator*() and operator++() which a pointer does.
+ */
+ using iterator = const TensorShape *;
+
+ /** Iterator to the first sample in the data set. */
+ iterator begin() const
+ {
+ return _shapes.data();
+ }
+
+protected:
+ /** Protected constructor to make the class abstract. */
+ template <typename... Ts>
+ ShapeDataset(Ts... shapes)
+ : _shapes{ { shapes... } }
+ {
+ }
+
+ /** Protected destructor to prevent deletion of derived class through a
+ * pointer to the base class.
+ */
+ ~ShapeDataset() = default;
+
+private:
+ std::array<TensorShape, Size> _shapes;
+};
+
+/** Data set containing one 1D tensor shape. */
+class Small1DShape final : public ShapeDataset<1>
+{
+public:
+ Small1DShape()
+ : ShapeDataset(TensorShape(128U))
+ {
+ }
+};
+
+/** Data set containing small tensor shapes. */
+class SmallShapes final : public ShapeDataset<3>
+{
+public:
+ SmallShapes()
+ : ShapeDataset(TensorShape(5U, 5U),
+ TensorShape(27U, 13U, 2U),
+ TensorShape(128U, 64U, 1U, 3U))
+ {
+ }
+};
+
+/** Data set containing large tensor shapes. */
+class LargeShapes final : public ShapeDataset<3>
+{
+public:
+ LargeShapes()
+ : ShapeDataset(TensorShape(1920U, 1080U),
+ TensorShape(1245U, 652U, 1U, 3U),
+ TensorShape(4160U, 3120U))
+ {
+ }
+};
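+
+// Note: Boost data sets compose with operator+, so a single test case can
+// cover both ranges; the validation tests added in this patch use exactly
+// this pattern:
+//
+//   BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)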
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_SHAPE_DATASETS_H__
diff --git a/tests/dataset/ThresholdDataset.h b/tests/dataset/ThresholdDataset.h
new file mode 100644
index 0000000000..956cf3d54d
--- /dev/null
+++ b/tests/dataset/ThresholdDataset.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_DATASET_THRESHOLD_DATASET_H__
+#define __ARM_COMPUTE_TEST_DATASET_THRESHOLD_DATASET_H__
+
+#include "TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "dataset/GenericDataset.h"
+
+#include <ostream>
+#include <sstream>
+
+#include <cstdint>
+#include <tuple>
+#include <type_traits>
+
+#ifdef BOOST
+#include "boost_wrapper.h"
+#endif
+
+namespace arm_compute
+{
+namespace test
+{
+class ThresholdDataObject
+{
+public:
+ uint8_t threshold;
+ uint8_t false_value;
+ uint8_t true_value;
+ ThresholdType type;
+ uint8_t upper;
+
+ operator std::string() const
+ {
+ std::stringstream ss;
+ ss << "Threshold";
+ ss << "_threshold_value" << threshold;
+ ss << "_false_value" << false_value;
+ ss << "_true_value" << true_value;
+ ss << "_type";
+ ss << ((type == ThresholdType::BINARY) ? "binary" : "range");
+ ss << "_upper" << upper;
+ return ss.str();
+ }
+
+ friend std::ostream &operator<<(std::ostream &os, const ThresholdDataObject &obj)
+ {
+ os << static_cast<std::string>(obj);
+ return os;
+ }
+};
+
+class ThresholdDataset : public GenericDataset<ThresholdDataObject, 4>
+{
+public:
+ ThresholdDataset()
+ : GenericDataset
+ {
+ ThresholdDataObject{ 10U, 25U, 3U, ThresholdType::BINARY, 0U },
+ ThresholdDataObject{ 20U, 1U, 0U, ThresholdType::BINARY, 0U },
+ ThresholdDataObject{ 30U, 1U, 0U, ThresholdType::RANGE, 100U },
+ ThresholdDataObject{ 100U, 1U, 0U, ThresholdType::RANGE, 200U },
+ }
+ {
+ }
+
+ ~ThresholdDataset() = default;
+};
+
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_DATASET_THRESHOLD_DATASET_H__
diff --git a/tests/model_objects/AlexNet.h b/tests/model_objects/AlexNet.h
new file mode 100644
index 0000000000..d49ef0645a
--- /dev/null
+++ b/tests/model_objects/AlexNet.h
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
+#define __ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include <array>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace model_objects
+{
+/** AlexNet model object */
+template <typename ITensorType,
+ typename TensorType,
+ typename SubTensorType,
+ typename Accessor,
+ typename ActivationLayerFunction,
+ typename ConvolutionLayerFunction,
+ typename FullyConnectedLayerFunction,
+ typename NormalizationLayerFunction,
+ typename PoolingLayerFunction,
+ typename SoftmaxLayerFunction,
+ DataType dt = DataType::F32,
+ int fixed_point_position = 4>
+class AlexNet
+{
+public:
+ AlexNet()
+ : _batches(1), _reshaped_weights(false)
+ {
+ }
+
+ void init_weights(unsigned int batches, bool reshaped_weights = false)
+ {
+ _batches = batches;
+ _reshaped_weights = reshaped_weights;
+
+ // Initialize weights and biases
+ if(!_reshaped_weights)
+ {
+ for(auto &wi : w)
+ {
+ wi = std::unique_ptr<TensorType>(new TensorType());
+ }
+ for(auto &bi : b)
+ {
+ bi = std::unique_ptr<TensorType>(new TensorType());
+ }
+ w[0]->allocator()->init(TensorInfo(TensorShape(11U, 11U, 3U, 96U), 1, dt, fixed_point_position));
+ b[0]->allocator()->init(TensorInfo(TensorShape(96U), 1, dt, fixed_point_position));
+ w[1]->allocator()->init(TensorInfo(TensorShape(5U, 5U, 48U, 256U), 1, dt, fixed_point_position));
+ b[1]->allocator()->init(TensorInfo(TensorShape(256U), 1, dt, fixed_point_position));
+ w[2]->allocator()->init(TensorInfo(TensorShape(3U, 3U, 256U, 384U), 1, dt, fixed_point_position));
+ b[2]->allocator()->init(TensorInfo(TensorShape(384U), 1, dt, fixed_point_position));
+ w[3]->allocator()->init(TensorInfo(TensorShape(3U, 3U, 192U, 384U), 1, dt, fixed_point_position));
+ b[3]->allocator()->init(TensorInfo(TensorShape(384U), 1, dt, fixed_point_position));
+ w[4]->allocator()->init(TensorInfo(TensorShape(3U, 3U, 192U, 256U), 1, dt, fixed_point_position));
+ b[4]->allocator()->init(TensorInfo(TensorShape(256U), 1, dt, fixed_point_position));
+ w[5]->allocator()->init(TensorInfo(TensorShape(9216U, 4096U), 1, dt, fixed_point_position));
+ b[5]->allocator()->init(TensorInfo(TensorShape(4096U), 1, dt, fixed_point_position));
+ w[6]->allocator()->init(TensorInfo(TensorShape(4096U, 4096U), 1, dt, fixed_point_position));
+ b[6]->allocator()->init(TensorInfo(TensorShape(4096U), 1, dt, fixed_point_position));
+ w[7]->allocator()->init(TensorInfo(TensorShape(4096U, 1000U), 1, dt, fixed_point_position));
+ b[7]->allocator()->init(TensorInfo(TensorShape(1000U), 1, dt, fixed_point_position));
+
+ w21 = std::unique_ptr<SubTensorType>(new SubTensorType(w[1].get(), TensorShape(5U, 5U, 48U, 128U), Coordinates()));
+ w22 = std::unique_ptr<SubTensorType>(new SubTensorType(w[1].get(), TensorShape(5U, 5U, 48U, 128U), Coordinates(0, 0, 0, 128)));
+ b21 = std::unique_ptr<SubTensorType>(new SubTensorType(b[1].get(), TensorShape(128U), Coordinates()));
+ b22 = std::unique_ptr<SubTensorType>(new SubTensorType(b[1].get(), TensorShape(128U), Coordinates(128)));
+
+ w41 = std::unique_ptr<SubTensorType>(new SubTensorType(w[3].get(), TensorShape(3U, 3U, 192U, 192U), Coordinates()));
+ w42 = std::unique_ptr<SubTensorType>(new SubTensorType(w[3].get(), TensorShape(3U, 3U, 192U, 192U), Coordinates(0, 0, 0, 192)));
+ b41 = std::unique_ptr<SubTensorType>(new SubTensorType(b[3].get(), TensorShape(192U), Coordinates()));
+ b42 = std::unique_ptr<SubTensorType>(new SubTensorType(b[3].get(), TensorShape(192U), Coordinates(192)));
+
+ w51 = std::unique_ptr<SubTensorType>(new SubTensorType(w[4].get(), TensorShape(3U, 3U, 192U, 128U), Coordinates()));
+ w52 = std::unique_ptr<SubTensorType>(new SubTensorType(w[4].get(), TensorShape(3U, 3U, 192U, 128U), Coordinates(0, 0, 0, 128)));
+ b51 = std::unique_ptr<SubTensorType>(new SubTensorType(b[4].get(), TensorShape(128U), Coordinates()));
+ b52 = std::unique_ptr<SubTensorType>(new SubTensorType(b[4].get(), TensorShape(128U), Coordinates(128)));
+ }
+ else
+ {
+ const unsigned int dt_size = 16 / arm_compute::data_size_from_type(dt);
+
+ // Create tensor for the reshaped weights
+ w[0] = std::unique_ptr<TensorType>(new TensorType());
+ auto w21_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w22_tensor = std::unique_ptr<TensorType>(new TensorType());
+ w[2] = std::unique_ptr<TensorType>(new TensorType());
+ auto w41_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w42_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w51_tensor = std::unique_ptr<TensorType>(new TensorType());
+ auto w52_tensor = std::unique_ptr<TensorType>(new TensorType());
+
+ w[0]->allocator()->init(TensorInfo(TensorShape(366U * dt_size, 96U / dt_size), 1, dt, fixed_point_position));
+ w21_tensor->allocator()->init(TensorInfo(TensorShape(1248U * dt_size, 128U / dt_size), 1, dt, fixed_point_position));
+ w22_tensor->allocator()->init(TensorInfo(TensorShape(1248U * dt_size, 128U / dt_size), 1, dt, fixed_point_position));
+ w[2]->allocator()->init(TensorInfo(TensorShape(2560U * dt_size, 384U / dt_size), 1, dt, fixed_point_position));
+ w41_tensor->allocator()->init(TensorInfo(TensorShape(1920U * dt_size, 192U / dt_size), 1, dt, fixed_point_position));
+ w42_tensor->allocator()->init(TensorInfo(TensorShape(1920U * dt_size, 192U / dt_size), 1, dt, fixed_point_position));
+ w51_tensor->allocator()->init(TensorInfo(TensorShape(1920U * dt_size, 128U / dt_size), 1, dt, fixed_point_position));
+ w52_tensor->allocator()->init(TensorInfo(TensorShape(1920U * dt_size, 128U / dt_size), 1, dt, fixed_point_position));
+
+ w21 = std::move(w21_tensor);
+ w22 = std::move(w22_tensor);
+ w41 = std::move(w41_tensor);
+ w42 = std::move(w42_tensor);
+ w51 = std::move(w51_tensor);
+ w52 = std::move(w52_tensor);
+
+ w[5] = std::unique_ptr<TensorType>(new TensorType());
+ w[6] = std::unique_ptr<TensorType>(new TensorType());
+ w[7] = std::unique_ptr<TensorType>(new TensorType());
+ b[5] = std::unique_ptr<TensorType>(new TensorType());
+ b[6] = std::unique_ptr<TensorType>(new TensorType());
+ b[7] = std::unique_ptr<TensorType>(new TensorType());
+
+ b[5]->allocator()->init(TensorInfo(TensorShape(4096U), 1, dt, fixed_point_position));
+ b[6]->allocator()->init(TensorInfo(TensorShape(4096U), 1, dt, fixed_point_position));
+ b[7]->allocator()->init(TensorInfo(TensorShape(1000U), 1, dt, fixed_point_position));
+
+ if(_batches > 1)
+ {
+ w[5]->allocator()->init(TensorInfo(TensorShape(9216U * dt_size, 4096U / dt_size), 1, dt, fixed_point_position));
+ w[6]->allocator()->init(TensorInfo(TensorShape(4096U * dt_size, 4096U / dt_size), 1, dt, fixed_point_position));
+ w[7]->allocator()->init(TensorInfo(TensorShape(4096U * dt_size, 1000U / dt_size), 1, dt, fixed_point_position));
+ }
+ else
+ {
+ w[5]->allocator()->init(TensorInfo(TensorShape(4096U, 9216U), 1, dt, fixed_point_position));
+ w[6]->allocator()->init(TensorInfo(TensorShape(4096U, 4096U), 1, dt, fixed_point_position));
+ w[7]->allocator()->init(TensorInfo(TensorShape(1000U, 4096U), 1, dt, fixed_point_position));
+ }
+ }
+ }
+
+ void build()
+ {
+ input.allocator()->init(TensorInfo(TensorShape(227U, 227U, 3U, _batches), 1, dt, fixed_point_position));
+ output.allocator()->init(TensorInfo(TensorShape(1000U, _batches), 1, dt, fixed_point_position));
+
+ // Initialize intermediate tensors
+ // Layer 1
+ conv1_out.allocator()->init(TensorInfo(TensorShape(55U, 55U, 96U, _batches), 1, dt, fixed_point_position));
+ act1_out.allocator()->init(TensorInfo(TensorShape(55U, 55U, 96U, _batches), 1, dt, fixed_point_position));
+ norm1_out.allocator()->init(TensorInfo(TensorShape(55U, 55U, 96U, _batches), 1, dt, fixed_point_position));
+ pool1_out.allocator()->init(TensorInfo(TensorShape(27U, 27U, 96U, _batches), 1, dt, fixed_point_position));
+ pool11_out = std::unique_ptr<SubTensorType>(new SubTensorType(&pool1_out, TensorShape(27U, 27U, 48U, _batches), Coordinates()));
+ pool12_out = std::unique_ptr<SubTensorType>(new SubTensorType(&pool1_out, TensorShape(27U, 27U, 48U, _batches), Coordinates(0, 0, 48)));
+ // Layer 2
+ conv2_out.allocator()->init(TensorInfo(TensorShape(27U, 27U, 256U, _batches), 1, dt, fixed_point_position));
+ conv21_out = std::unique_ptr<SubTensorType>(new SubTensorType(&conv2_out, TensorShape(27U, 27U, 128U, _batches), Coordinates()));
+ conv22_out = std::unique_ptr<SubTensorType>(new SubTensorType(&conv2_out, TensorShape(27U, 27U, 128U, _batches), Coordinates(0, 0, 128)));
+ act2_out.allocator()->init(TensorInfo(TensorShape(27U, 27U, 256U, _batches), 1, dt, fixed_point_position));
+ norm2_out.allocator()->init(TensorInfo(TensorShape(27U, 27U, 256U, _batches), 1, dt, fixed_point_position));
+ pool2_out.allocator()->init(TensorInfo(TensorShape(13U, 13U, 256U, _batches), 1, dt, fixed_point_position));
+ // Layer 3
+ conv3_out.allocator()->init(TensorInfo(TensorShape(13U, 13U, 384U, _batches), 1, dt, fixed_point_position));
+ act3_out.allocator()->init(TensorInfo(TensorShape(13U, 13U, 384U, _batches), 1, dt, fixed_point_position));
+ act31_out = std::unique_ptr<SubTensorType>(new SubTensorType(&act3_out, TensorShape(13U, 13U, 192U, _batches), Coordinates()));
+ act32_out = std::unique_ptr<SubTensorType>(new SubTensorType(&act3_out, TensorShape(13U, 13U, 192U, _batches), Coordinates(0, 0, 192)));
+ // Layer 4
+ conv4_out.allocator()->init(TensorInfo(TensorShape(13U, 13U, 384U, _batches), 1, dt, fixed_point_position));
+ conv41_out = std::unique_ptr<SubTensorType>(new SubTensorType(&conv4_out, TensorShape(13U, 13U, 192U, _batches), Coordinates()));
+ conv42_out = std::unique_ptr<SubTensorType>(new SubTensorType(&conv4_out, TensorShape(13U, 13U, 192U, _batches), Coordinates(0, 0, 192)));
+ act4_out.allocator()->init(TensorInfo(TensorShape(13U, 13U, 384U, _batches), 1, dt, fixed_point_position));
+ act41_out = std::unique_ptr<SubTensorType>(new SubTensorType(&act4_out, TensorShape(13U, 13U, 192U, _batches), Coordinates()));
+ act42_out = std::unique_ptr<SubTensorType>(new SubTensorType(&act4_out, TensorShape(13U, 13U, 192U, _batches), Coordinates(0, 0, 192)));
+ // Layer 5
+ conv5_out.allocator()->init(TensorInfo(TensorShape(13U, 13U, 256U, _batches), 1, dt, fixed_point_position));
+ conv51_out = std::unique_ptr<SubTensorType>(new SubTensorType(&conv5_out, TensorShape(13U, 13U, 128U, _batches), Coordinates()));
+ conv52_out = std::unique_ptr<SubTensorType>(new SubTensorType(&conv5_out, TensorShape(13U, 13U, 128U, _batches), Coordinates(0, 0, 128)));
+ act5_out.allocator()->init(TensorInfo(TensorShape(13U, 13U, 256U, _batches), 1, dt, fixed_point_position));
+ pool5_out.allocator()->init(TensorInfo(TensorShape(6U, 6U, 256U, _batches), 1, dt, fixed_point_position));
+ // Layer 6
+ fc6_out.allocator()->init(TensorInfo(TensorShape(4096U, _batches), 1, dt, fixed_point_position));
+ act6_out.allocator()->init(TensorInfo(TensorShape(4096U, _batches), 1, dt, fixed_point_position));
+ // Layer 7
+ fc7_out.allocator()->init(TensorInfo(TensorShape(4096U, _batches), 1, dt, fixed_point_position));
+ act7_out.allocator()->init(TensorInfo(TensorShape(4096U, _batches), 1, dt, fixed_point_position));
+ // Layer 8
+ fc8_out.allocator()->init(TensorInfo(TensorShape(1000U, _batches), 1, dt, fixed_point_position));
+
+ // Allocate layers
+ {
+ // Layer 1
+ conv1 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ act1 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ norm1 = std::unique_ptr<NormalizationLayerFunction>(new NormalizationLayerFunction());
+ pool1 = std::unique_ptr<PoolingLayerFunction>(new PoolingLayerFunction());
+ // Layer 2
+ conv21 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ conv22 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ act2 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ norm2 = std::unique_ptr<NormalizationLayerFunction>(new NormalizationLayerFunction());
+ pool2 = std::unique_ptr<PoolingLayerFunction>(new PoolingLayerFunction());
+ // Layer 3
+ conv3 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ act3 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ // Layer 4
+ conv41 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ conv42 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ act4 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ // Layer 5
+ conv51 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ conv52 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ act5 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ pool5 = std::unique_ptr<PoolingLayerFunction>(new PoolingLayerFunction());
+ // Layer 6
+ fc6 = std::unique_ptr<FullyConnectedLayerFunction>(new FullyConnectedLayerFunction());
+ act6 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ // Layer 7
+ fc7 = std::unique_ptr<FullyConnectedLayerFunction>(new FullyConnectedLayerFunction());
+ act7 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ // Layer 8
+ fc8 = std::unique_ptr<FullyConnectedLayerFunction>(new FullyConnectedLayerFunction());
+ // Softmax
+ smx = std::unique_ptr<SoftmaxLayerFunction>(new SoftmaxLayerFunction());
+ }
+
+ // Configure Layers
+ {
+ // Layer 1
+ conv1->configure(&input, w[0].get(), b[0].get(), &conv1_out, PadStrideInfo(4, 4, 0, 0), WeightsInfo(_reshaped_weights, 11U));
+ act1->configure(&conv1_out, &act1_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ norm1->configure(&act1_out, &norm1_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
+ pool1->configure(&norm1_out, &pool1_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
+ // Layer 2
+ conv21->configure(pool11_out.get(), w21.get(), b21.get(), conv21_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U));
+ conv22->configure(pool12_out.get(), w22.get(), b22.get(), conv22_out.get(), PadStrideInfo(1, 1, 2, 2), WeightsInfo(_reshaped_weights, 5U));
+ act2->configure(&conv2_out, &act2_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ norm2->configure(&act2_out, &norm2_out, NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f));
+ pool2->configure(&norm2_out, &pool2_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
+ // Layer 3
+ conv3->configure(&pool2_out, w[2].get(), b[2].get(), &conv3_out, PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U));
+ act3->configure(&conv3_out, &act3_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ // Layer 4
+ conv41->configure(act31_out.get(), w41.get(), b41.get(), conv41_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U));
+ conv42->configure(act32_out.get(), w42.get(), b42.get(), conv42_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U));
+ act4->configure(&conv4_out, &act4_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ // Layer 5
+ conv51->configure(act41_out.get(), w51.get(), b51.get(), conv51_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U));
+ conv52->configure(act42_out.get(), w52.get(), b52.get(), conv52_out.get(), PadStrideInfo(1, 1, 1, 1), WeightsInfo(_reshaped_weights, 3U));
+ act5->configure(&conv5_out, &act5_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ pool5->configure(&act5_out, &pool5_out, PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)));
+ // Layer 6
+ fc6->configure(&pool5_out, w[5].get(), b[5].get(), &fc6_out, true, _reshaped_weights);
+ act6->configure(&fc6_out, &act6_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ // Layer 7
+ fc7->configure(&act6_out, w[6].get(), b[6].get(), &fc7_out, true, _reshaped_weights);
+ act7->configure(&fc7_out, &act7_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ // Layer 8
+ fc8->configure(&act7_out, w[7].get(), b[7].get(), &fc8_out, true, _reshaped_weights);
+ // Softmax
+ smx->configure(&fc8_out, &output);
+ }
+ }
+
+ void allocate()
+ {
+ input.allocator()->allocate();
+ output.allocator()->allocate();
+ for(auto &wi : w)
+ {
+ if(wi.get())
+ {
+ wi->allocator()->allocate();
+ }
+ }
+ for(auto &bi : b)
+ {
+ if(bi.get())
+ {
+ bi->allocator()->allocate();
+ }
+ }
+ if(_reshaped_weights)
+ {
+ dynamic_cast<TensorType *>(w21.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w22.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w41.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w42.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w51.get())->allocator()->allocate();
+ dynamic_cast<TensorType *>(w52.get())->allocator()->allocate();
+ }
+ conv1_out.allocator()->allocate();
+ act1_out.allocator()->allocate();
+ norm1_out.allocator()->allocate();
+ pool1_out.allocator()->allocate();
+ conv2_out.allocator()->allocate();
+ act2_out.allocator()->allocate();
+ norm2_out.allocator()->allocate();
+ pool2_out.allocator()->allocate();
+ conv3_out.allocator()->allocate();
+ act3_out.allocator()->allocate();
+ conv4_out.allocator()->allocate();
+ act4_out.allocator()->allocate();
+ conv5_out.allocator()->allocate();
+ act5_out.allocator()->allocate();
+ pool5_out.allocator()->allocate();
+ fc6_out.allocator()->allocate();
+ act6_out.allocator()->allocate();
+ fc7_out.allocator()->allocate();
+ act7_out.allocator()->allocate();
+ fc8_out.allocator()->allocate();
+ }
+
+ /** Fills the trainable parameters and input with random data. */
+ void fill_random()
+ {
+ library->fill_tensor_uniform(Accessor(input), 0);
+ if(!_reshaped_weights)
+ {
+ for(unsigned int i = 0; i < w.size(); ++i)
+ {
+ library->fill_tensor_uniform(Accessor(*w[i]), i + 1);
+ library->fill_tensor_uniform(Accessor(*b[i]), i + 10);
+ }
+ }
+ else
+ {
+ library->fill_tensor_uniform(Accessor(*w[0]), 1);
+ library->fill_tensor_uniform(Accessor(*w[2]), 2);
+
+ library->fill_tensor_uniform(Accessor(*w[5]), 3);
+ library->fill_tensor_uniform(Accessor(*b[5]), 4);
+ library->fill_tensor_uniform(Accessor(*w[6]), 5);
+ library->fill_tensor_uniform(Accessor(*b[6]), 6);
+ library->fill_tensor_uniform(Accessor(*w[7]), 7);
+ library->fill_tensor_uniform(Accessor(*b[7]), 8);
+
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w21.get())), 9);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w22.get())), 10);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w41.get())), 11);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w42.get())), 12);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w51.get())), 13);
+ library->fill_tensor_uniform(Accessor(*dynamic_cast<TensorType *>(w52.get())), 14);
+ }
+ }
+
+#ifdef INTERNAL_ONLY
+ /** Fills the trainable parameters from binary files.
+ *
+ * @param weights File names containing the weights data
+ * @param biases File names containing the bias data
+ */
+ void fill(std::vector<std::string> weights, std::vector<std::string> biases)
+ {
+ ARM_COMPUTE_ERROR_ON(weights.size() != w.size());
+ ARM_COMPUTE_ERROR_ON(biases.size() != b.size());
+ ARM_COMPUTE_ERROR_ON(_reshaped_weights);
+
+ for(unsigned int i = 0; i < weights.size(); ++i)
+ {
+ library->fill_layer_data(Accessor(*w[i]), weights[i]);
+ library->fill_layer_data(Accessor(*b[i]), biases[i]);
+ }
+ }
+
+ /** Feeds input to the network from a file.
+ *
+ * @param name Name of the file containing the input data.
+ */
+ void feed(std::string name)
+ {
+ library->fill_layer_data(Accessor(input), name);
+ }
+#endif /* INTERNAL_ONLY */
+
+ /** Get the classification results.
+ *
+ * @return Vector containing the classified labels
+ */
+ std::vector<unsigned int> get_classifications()
+ {
+ std::vector<unsigned int> classified_labels;
+ Accessor output_accessor(output);
+
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ for(unsigned int d = 1; d < output_accessor.shape().num_dimensions(); ++d)
+ {
+ window.set(d, Window::Dimension(0, output_accessor.shape()[d], 1));
+ }
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
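+ // Argmax over the class scores of the current batch element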
+ int max_idx = 0;
+ float val = 0;
+ const void *const out_ptr = output_accessor(id);
+ for(unsigned int l = 0; l < output_accessor.shape().x(); ++l)
+ {
+ float curr_val = reinterpret_cast<const float *>(out_ptr)[l];
+ if(curr_val > val)
+ {
+ max_idx = l;
+ val = curr_val;
+ }
+ }
+ classified_labels.push_back(max_idx);
+ });
+ return classified_labels;
+ }
+
+ /** Clears all allocated memory from the tensor objects */
+ void clear()
+ {
+ conv1.reset();
+ act1.reset();
+ norm1.reset();
+ pool1.reset();
+ conv21.reset();
+ conv22.reset();
+ act2.reset();
+ norm2.reset();
+ pool2.reset();
+ conv3.reset();
+ act3.reset();
+ conv41.reset();
+ conv42.reset();
+ act4.reset();
+ conv51.reset();
+ conv52.reset();
+ act5.reset();
+ pool5.reset();
+ fc6.reset();
+ act6.reset();
+ fc7.reset();
+ act7.reset();
+ fc8.reset();
+ smx.reset();
+
+ // Free allocations
+ input.allocator()->free();
+ output.allocator()->free();
+ for(auto &wi : w)
+ {
+ wi.reset();
+ }
+ for(auto &bi : b)
+ {
+ bi.reset();
+ }
+
+ w21.reset();
+ w22.reset();
+ b21.reset();
+ b22.reset();
+ w41.reset();
+ w42.reset();
+ b41.reset();
+ b42.reset();
+ w51.reset();
+ w52.reset();
+ b51.reset();
+ b52.reset();
+
+ conv1_out.allocator()->free();
+ act1_out.allocator()->free();
+ norm1_out.allocator()->free();
+ pool1_out.allocator()->free();
+ conv2_out.allocator()->free();
+ act2_out.allocator()->free();
+ norm2_out.allocator()->free();
+ pool2_out.allocator()->free();
+ conv3_out.allocator()->free();
+ act3_out.allocator()->free();
+ conv4_out.allocator()->free();
+ act4_out.allocator()->free();
+ conv5_out.allocator()->free();
+ act5_out.allocator()->free();
+ pool5_out.allocator()->free();
+ fc6_out.allocator()->free();
+ act6_out.allocator()->free();
+ fc7_out.allocator()->free();
+ act7_out.allocator()->free();
+ fc8_out.allocator()->free();
+ }
+
+ /** Runs the model */
+ void run()
+ {
+ // Layer 1
+ conv1->run();
+ act1->run();
+ norm1->run();
+ pool1->run();
+ // Layer 2
+ conv21->run();
+ conv22->run();
+ act2->run();
+ norm2->run();
+ pool2->run();
+ // Layer 3
+ conv3->run();
+ act3->run();
+ // Layer 4
+ conv41->run();
+ conv42->run();
+ act4->run();
+ // Layer 5
+ conv51->run();
+ conv52->run();
+ act5->run();
+ pool5->run();
+ // Layer 6
+ fc6->run();
+ act6->run();
+ // Layer 7
+ fc7->run();
+ act7->run();
+ // Layer 8
+ fc8->run();
+ // Softmax
+ smx->run();
+ }
+
+private:
+ unsigned int _batches;
+ bool _reshaped_weights;
+
+ std::unique_ptr<ActivationLayerFunction> act1{ nullptr }, act2{ nullptr }, act3{ nullptr }, act4{ nullptr }, act5{ nullptr }, act6{ nullptr }, act7{ nullptr };
+ std::unique_ptr<ConvolutionLayerFunction> conv1{ nullptr }, conv21{ nullptr }, conv22{ nullptr }, conv3{ nullptr }, conv41{ nullptr }, conv42{ nullptr }, conv51{ nullptr }, conv52{ nullptr };
+ std::unique_ptr<FullyConnectedLayerFunction> fc6{ nullptr }, fc7{ nullptr }, fc8{ nullptr };
+ std::unique_ptr<NormalizationLayerFunction> norm1{ nullptr }, norm2{ nullptr };
+ std::unique_ptr<PoolingLayerFunction> pool1{ nullptr }, pool2{ nullptr }, pool5{ nullptr };
+ std::unique_ptr<SoftmaxLayerFunction> smx{ nullptr };
+
+ TensorType input{}, output{};
+ std::array<std::unique_ptr<TensorType>, 8> w{}, b{};
+ std::unique_ptr<ITensorType> w21{ nullptr }, w22{ nullptr }, b21{ nullptr }, b22{ nullptr };
+ std::unique_ptr<ITensorType> w41{ nullptr }, w42{ nullptr }, b41{ nullptr }, b42{ nullptr };
+ std::unique_ptr<ITensorType> w51{ nullptr }, w52{ nullptr }, b51{ nullptr }, b52{ nullptr };
+
+ TensorType conv1_out{}, act1_out{}, norm1_out{}, pool1_out{};
+ TensorType conv2_out{}, act2_out{}, pool2_out{}, norm2_out{};
+ TensorType conv3_out{}, act3_out{};
+ TensorType conv4_out{}, act4_out{};
+ TensorType conv5_out{}, act5_out{}, pool5_out{};
+ TensorType fc6_out{}, act6_out{};
+ TensorType fc7_out{}, act7_out{};
+ TensorType fc8_out{};
+
+ std::unique_ptr<SubTensorType> pool11_out{ nullptr }, pool12_out{ nullptr };
+ std::unique_ptr<SubTensorType> conv21_out{ nullptr }, conv22_out{ nullptr };
+ std::unique_ptr<SubTensorType> act31_out{ nullptr }, act32_out{ nullptr };
+ std::unique_ptr<SubTensorType> conv41_out{ nullptr }, conv42_out{ nullptr }, act41_out{ nullptr }, act42_out{ nullptr };
+ std::unique_ptr<SubTensorType> conv51_out{ nullptr }, conv52_out{ nullptr };
+};
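+
+// Hedged usage sketch: the backend typedefs below are illustrative assumptions
+// (NEON runtime types), not part of this header. A typical call sequence is:
+//
+//   model_objects::AlexNet<ITensor, Tensor, SubTensor, NEAccessor,
+//                          NEActivationLayer, NEConvolutionLayer,
+//                          NEFullyConnectedLayer, NENormalizationLayer,
+//                          NEPoolingLayer, NESoftmaxLayer> net{};
+//   net.init_weights(1 /* batches */);
+//   net.build();
+//   net.allocate();
+//   net.fill_random();
+//   net.run();
+//   const std::vector<unsigned int> labels = net.get_classifications();
+//   net.clear();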
+} // namespace model_objects
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_MODEL_OBJECTS_ALEXNET_H__
diff --git a/tests/model_objects/LeNet5.h b/tests/model_objects/LeNet5.h
new file mode 100644
index 0000000000..7d5090f5d0
--- /dev/null
+++ b/tests/model_objects/LeNet5.h
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_MODEL_OBJECTS_LENET5_H__
+#define __ARM_COMPUTE_TEST_MODEL_OBJECTS_LENET5_H__
+
+#include "TensorLibrary.h"
+#include "Utils.h"
+
+#include <array>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace model_objects
+{
+/** LeNet5 model object */
+template <typename TensorType,
+ typename Accessor,
+ typename ActivationLayerFunction,
+ typename ConvolutionLayerFunction,
+ typename FullyConnectedLayerFunction,
+ typename PoolingLayerFunction,
+ typename SoftmaxLayerFunction>
+class LeNet5
+{
+public:
+ /** Initialize and build the model.
+ *
+ * @param batches Number of batches the model should handle
+ */
+ void build(unsigned int batches)
+ {
+ // Initialize input, output, weights and biases
+ input.allocator()->init(TensorInfo(TensorShape(28U, 28U, 1U, batches), 1, DataType::F32));
+ output.allocator()->init(TensorInfo(TensorShape(10U, batches), 1, DataType::F32));
+ w[0].allocator()->init(TensorInfo(TensorShape(5U, 5U, 1U, 20U), 1, DataType::F32));
+ b[0].allocator()->init(TensorInfo(TensorShape(20U), 1, DataType::F32));
+ w[1].allocator()->init(TensorInfo(TensorShape(5U, 5U, 20U, 50U), 1, DataType::F32));
+ b[1].allocator()->init(TensorInfo(TensorShape(50U), 1, DataType::F32));
+ w[2].allocator()->init(TensorInfo(TensorShape(800U, 500U), 1, DataType::F32));
+ b[2].allocator()->init(TensorInfo(TensorShape(500U), 1, DataType::F32));
+ w[3].allocator()->init(TensorInfo(TensorShape(500U, 10U), 1, DataType::F32));
+ b[3].allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
+
+ // Initialize intermediate tensors
+ // Layer 1
+ conv1_out.allocator()->init(TensorInfo(TensorShape(24U, 24U, 20U, batches), 1, DataType::F32));
+ pool1_out.allocator()->init(TensorInfo(TensorShape(12U, 12U, 20U, batches), 1, DataType::F32));
+ // Layer 2
+ conv2_out.allocator()->init(TensorInfo(TensorShape(8U, 8U, 50U, batches), 1, DataType::F32));
+ pool2_out.allocator()->init(TensorInfo(TensorShape(4U, 4U, 50U, batches), 1, DataType::F32));
+ // Layer 3
+ fc1_out.allocator()->init(TensorInfo(TensorShape(500U, batches), 1, DataType::F32));
+ act1_out.allocator()->init(TensorInfo(TensorShape(500U, batches), 1, DataType::F32));
+ // Layer 4
+ fc2_out.allocator()->init(TensorInfo(TensorShape(10U, batches), 1, DataType::F32));
+
+ // Allocate layers
+ {
+ // Layer 1
+ conv1 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ pool1 = std::unique_ptr<PoolingLayerFunction>(new PoolingLayerFunction());
+ // Layer 2
+ conv2 = std::unique_ptr<ConvolutionLayerFunction>(new ConvolutionLayerFunction());
+ pool2 = std::unique_ptr<PoolingLayerFunction>(new PoolingLayerFunction());
+ // Layer 3
+ fc1 = std::unique_ptr<FullyConnectedLayerFunction>(new FullyConnectedLayerFunction());
+ act1 = std::unique_ptr<ActivationLayerFunction>(new ActivationLayerFunction());
+ // Layer 4
+ fc2 = std::unique_ptr<FullyConnectedLayerFunction>(new FullyConnectedLayerFunction());
+ // Softmax
+ smx = std::unique_ptr<SoftmaxLayerFunction>(new SoftmaxLayerFunction());
+ }
+
+ // Configure Layers
+ {
+ conv1->configure(&input, &w[0], &b[0], &conv1_out, PadStrideInfo(1, 1, 0, 0));
+ pool1->configure(&conv1_out, &pool1_out, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
+ conv2->configure(&pool1_out, &w[1], &b[1], &conv2_out, PadStrideInfo(1, 1, 0, 0));
+ pool2->configure(&conv2_out, &pool2_out, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));
+ fc1->configure(&pool2_out, &w[2], &b[2], &fc1_out);
+ act1->configure(&fc1_out, &act1_out, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ fc2->configure(&act1_out, &w[3], &b[3], &fc2_out);
+ smx->configure(&fc2_out, &output);
+ }
+
+ // Allocate tensors
+ {
+ input.allocator()->allocate();
+ output.allocator()->allocate();
+ for(auto &wi : w)
+ {
+ wi.allocator()->allocate();
+ }
+ for(auto &bi : b)
+ {
+ bi.allocator()->allocate();
+ }
+ conv1_out.allocator()->allocate();
+ pool1_out.allocator()->allocate();
+ conv2_out.allocator()->allocate();
+ pool2_out.allocator()->allocate();
+ fc1_out.allocator()->allocate();
+ act1_out.allocator()->allocate();
+ fc2_out.allocator()->allocate();
+ }
+ }
+
+ /** Fills the trainable parameters and input with random data. */
+ void fill_random()
+ {
+ std::uniform_real_distribution<> distribution(-1, 1);
+ library->fill(Accessor(input), distribution, 0);
+ for(unsigned int i = 0; i < w.size(); ++i)
+ {
+ library->fill(Accessor(w[i]), distribution, i + 1);
+ library->fill(Accessor(b[i]), distribution, i + 10);
+ }
+ }
+
+#ifdef INTERNAL_ONLY
+ /** Fills the trainable parameters from binary files.
+ *
+ * @param weights File names containing the weights data
+ * @param biases File names containing the bias data
+ */
+ void fill(std::vector<std::string> weights, std::vector<std::string> biases)
+ {
+ ARM_COMPUTE_ERROR_ON(weights.size() != w.size());
+ ARM_COMPUTE_ERROR_ON(biases.size() != b.size());
+
+ for(unsigned int i = 0; i < weights.size(); ++i)
+ {
+ library->fill_layer_data(Accessor(w[i]), weights[i]);
+ library->fill_layer_data(Accessor(b[i]), biases[i]);
+ }
+ }
+
+ /** Feeds input to the network from a file.
+ *
+ * @param name Name of the file containing the input data.
+ */
+ void feed(std::string name)
+ {
+ library->fill_layer_data(Accessor(input), name);
+ }
+#endif /* INTERNAL_ONLY */
+
+ /** Get the classification results.
+ *
+ * @return Vector containing the classified labels
+ */
+ std::vector<unsigned int> get_classifications()
+ {
+ std::vector<unsigned int> classified_labels;
+ Accessor output_accessor(output);
+
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, 1, 1));
+ for(unsigned int d = 1; d < output_accessor.shape().num_dimensions(); ++d)
+ {
+ window.set(d, Window::Dimension(0, output_accessor.shape()[d], 1));
+ }
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
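+ // The index of the largest output value is taken as the predicted label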
+ int max_idx = 0;
+ float val = 0;
+ const void *const out_ptr = output_accessor(id);
+ for(unsigned int l = 0; l < output_accessor.shape().x(); ++l)
+ {
+ float curr_val = reinterpret_cast<const float *>(out_ptr)[l];
+ if(curr_val > val)
+ {
+ max_idx = l;
+ val = curr_val;
+ }
+ }
+ classified_labels.push_back(max_idx);
+ });
+ return classified_labels;
+ }
+
+ /** Clears all allocated memory from the tensor objects */
+ void clear()
+ {
+ conv1.reset();
+ pool1.reset();
+ conv2.reset();
+ pool2.reset();
+ fc1.reset();
+ act1.reset();
+ fc2.reset();
+ smx.reset();
+
+ input.allocator()->free();
+ output.allocator()->free();
+ for(auto &wi : w)
+ {
+ wi.allocator()->free();
+ }
+ for(auto &bi : b)
+ {
+ bi.allocator()->free();
+ }
+
+ conv1_out.allocator()->free();
+ pool1_out.allocator()->free();
+ conv2_out.allocator()->free();
+ pool2_out.allocator()->free();
+ fc1_out.allocator()->free();
+ act1_out.allocator()->free();
+ fc2_out.allocator()->free();
+ }
+
+ /** Runs the model */
+ void run()
+ {
+ // Layer 1
+ conv1->run();
+ pool1->run();
+ // Layer 2
+ conv2->run();
+ pool2->run();
+ // Layer 3
+ fc1->run();
+ act1->run();
+ // Layer 4
+ fc2->run();
+ // Softmax
+ smx->run();
+ }
+
+private:
+ std::unique_ptr<ActivationLayerFunction> act1{ nullptr };
+ std::unique_ptr<ConvolutionLayerFunction> conv1{ nullptr }, conv2{ nullptr };
+ std::unique_ptr<FullyConnectedLayerFunction> fc1{ nullptr }, fc2{ nullptr };
+ std::unique_ptr<PoolingLayerFunction> pool1{ nullptr }, pool2{ nullptr };
+ std::unique_ptr<SoftmaxLayerFunction> smx{ nullptr };
+
+ TensorType input{}, output{};
+ std::array<TensorType, 4> w{}, b{};
+
+ TensorType conv1_out{}, pool1_out{};
+ TensorType conv2_out{}, pool2_out{};
+ TensorType fc1_out{}, act1_out{};
+ TensorType fc2_out{};
+};
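+
+// Hedged usage sketch (backend typedefs are illustrative assumptions): unlike
+// AlexNet, LeNet5 allocates all of its tensors inside build(), so a typical
+// run looks like:
+//
+//   model_objects::LeNet5<Tensor, NEAccessor, NEActivationLayer,
+//                         NEConvolutionLayer, NEFullyConnectedLayer,
+//                         NEPoolingLayer, NESoftmaxLayer> net{};
+//   net.build(1 /* batches */);
+//   net.fill_random();
+//   net.run();
+//   const std::vector<unsigned int> labels = net.get_classifications();
+//   net.clear();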
+} // namespace model_objects
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_MODEL_OBJECTS_LENET5_H__
diff --git a/tests/validation/CL/BitwiseAnd.cpp b/tests/validation/CL/BitwiseAnd.cpp
new file mode 100644
index 0000000000..4cd64a2a99
--- /dev/null
+++ b/tests/validation/CL/BitwiseAnd.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::cl;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute OpenCL bitwise and function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+CLTensor compute_bitwise_and(const TensorShape &shape)
+{
+ // Create tensors
+ CLTensor src1 = create_tensor(shape, DataType::U8);
+ CLTensor src2 = create_tensor(shape, DataType::U8);
+ CLTensor dst = create_tensor(shape, DataType::U8);
+
+ // Create and configure function
+ CLBitwiseAnd band;
+ band.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(CLAccessor(src1), 0);
+ library->fill_tensor_uniform(CLAccessor(src2), 1);
+
+ // Compute function
+ band.run();
+
+ return dst;
+}
+
+/** Compute OpenCL bitwise and function that splits the inputs and output into two subtensors each.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+CLTensor compute_bitwise_and_subtensor(const TensorShape &shape)
+{
+ // Create tensors
+ CLTensor src1 = create_tensor(shape, DataType::U8);
+ CLTensor src2 = create_tensor(shape, DataType::U8);
+ CLTensor dst = create_tensor(shape, DataType::U8);
+
+ // Create SubTensors
+ int coord_z = shape.z() / 2;
+ TensorShape sub_shape = shape;
+ sub_shape.set(2, coord_z);
+
+ CLSubTensor src1_sub1(&src1, sub_shape, Coordinates());
+ CLSubTensor src1_sub2(&src1, sub_shape, Coordinates(0, 0, coord_z));
+ CLSubTensor src2_sub1(&src2, sub_shape, Coordinates());
+ CLSubTensor src2_sub2(&src2, sub_shape, Coordinates(0, 0, coord_z));
+ CLSubTensor dst_sub1(&dst, sub_shape, Coordinates());
+ CLSubTensor dst_sub2(&dst, sub_shape, Coordinates(0, 0, coord_z));
+
+ // Create and configure function
+ CLBitwiseAnd band1, band2;
+ band1.configure(&src1_sub1, &src2_sub1, &dst_sub1);
+ band2.configure(&src1_sub2, &src2_sub2, &dst_sub2);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ std::uniform_int_distribution<> distribution(0, 255);
+ library->fill(CLAccessor(src1), distribution, 0);
+ library->fill(CLAccessor(src2), distribution, 1);
+
+ // Compute function
+ band1.run();
+ band2.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(CL)
+BOOST_AUTO_TEST_SUITE(BitwiseAnd)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)
+{
+ // Create tensors
+ CLTensor src1 = create_tensor(shape, DataType::U8);
+ CLTensor src2 = create_tensor(shape, DataType::U8);
+ CLTensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ CLBitwiseAnd band;
+ band.configure(&src1, &src2, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ CLTensor dst = compute_bitwise_and(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_and(shape);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_AUTO_TEST_CASE(RunSubTensor)
+{
+ // Create shape
+ TensorShape shape(27U, 35U, 8U, 2U);
+
+ // Compute function
+ CLTensor dst = compute_bitwise_and_subtensor(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_and(shape);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(), shape)
+{
+ // Compute function
+ CLTensor dst = compute_bitwise_and(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_and(shape);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/CL/CLFixture.cpp b/tests/validation/CL/CLFixture.cpp
new file mode 100644
index 0000000000..845e16629d
--- /dev/null
+++ b/tests/validation/CL/CLFixture.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "validation/CL/CLFixture.h"
+
+#include "boost_wrapper.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::validation;
+using namespace arm_compute::test::validation::cl;
+
+BOOST_GLOBAL_FIXTURE(CLFixture);
diff --git a/tests/validation/CL/CLFixture.h b/tests/validation/CL/CLFixture.h
new file mode 100644
index 0000000000..138e0566eb
--- /dev/null
+++ b/tests/validation/CL/CLFixture.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_CL_CLFIXTURE_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_CL_CLFIXTURE_H__
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace cl
+{
+struct CLFixture
+{
+ CLFixture()
+ {
+ CLScheduler::get().default_init();
+ }
+};
+} // namespace cl
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/validation/CL/CMakeLists.txt b/tests/validation/CL/CMakeLists.txt
new file mode 100644
index 0000000000..209b662033
--- /dev/null
+++ b/tests/validation/CL/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+include_directories(${CMAKE_SOURCE_DIR}/../include)
+
+set(arm_compute_test_validation_OPENCL_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/CL/CLAccessor.h
+ ${CMAKE_SOURCE_DIR}/CL/Helper.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/CLFixture.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/CLFixture.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseAnd.cpp
+)
+
+add_library(arm_compute_test_validation_OPENCL OBJECT
+ ${arm_compute_test_validation_OPENCL_SOURCE_FILES}
+)
+
+set(arm_compute_test_validation_TARGET_OBJECTS
+ ${arm_compute_test_validation_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_validation_OPENCL>
+ PARENT_SCOPE
+)
+
+set(arm_compute_test_validation_TARGET_LIBRARIES
+ ${arm_compute_test_validation_TARGET_LIBRARIES}
+ OpenCL
+ PARENT_SCOPE
+)
diff --git a/tests/validation/CL/DepthConvert.cpp b/tests/validation/CL/DepthConvert.cpp
new file mode 100644
index 0000000000..7a421ecf5a
--- /dev/null
+++ b/tests/validation/CL/DepthConvert.cpp
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::cl;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute CL depth convert function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in Data type of input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ *
+ * @return Computed output CLTensor.
+ */
+CLTensor compute_depth_convert(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift)
+{
+ // Create tensors
+ CLTensor src = create_tensor(shape, dt_in);
+ CLTensor dst = create_tensor(shape, dt_out);
+
+ // Create and configure function
+ CLDepthConvert depth_convert;
+ depth_convert.configure(&src, &dst, policy, shift);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(CLAccessor(src), 0);
+
+ // Compute function
+ depth_convert.run();
+
+ return dst;
+}
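+
+// Illustrative note (an assumption about kernel behaviour, not stated in this
+// file): up conversions such as U8 -> U16 are expected to shift the value left
+// by 'shift' (e.g. src = 10, shift = 2 -> dst = 40), while down conversions
+// shift right and apply the ConvertPolicy (SATURATE or WRAP) to out-of-range
+// results.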
+/** Configure the depth convert function and validate the valid region and padding.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in Data type of input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ *
+ */
+void compute_configure_validate(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift)
+{
+ // Create tensors
+ CLTensor src = create_tensor(shape, dt_in);
+ CLTensor dst = create_tensor(shape, dt_out);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ CLDepthConvert depth_convert;
+ depth_convert.configure(&src, &dst, policy, shift);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(CL)
+BOOST_AUTO_TEST_SUITE(DepthConvert)
+
+BOOST_AUTO_TEST_SUITE(U8_to_U16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U8, DataType::U16, policy, shift);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U8, DataType::U16, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::U16, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U8, DataType::U16, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::U16, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U8_to_S16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U8, DataType::S16, policy, shift);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U8, DataType::S16, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S16, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U8, DataType::S16, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S16, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U8_to_S32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U8, DataType::S32, policy, shift);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U8, DataType::S32, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U8, DataType::S32, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U16_to_U8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U16, DataType::U8, policy, shift);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U16, DataType::U8, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U16, DataType::U8, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U16_to_U32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U16, DataType::U32, policy, shift);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U16, DataType::U32, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U32, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::U16, DataType::U32, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U32, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(S16_to_U8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::S16, DataType::U8, policy, shift);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::S16, DataType::U8, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::S16, DataType::U8, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(S16_to_S32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::S16, DataType::S32, policy, shift);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::S16, DataType::S32, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ CLTensor dst = compute_depth_convert(shape, DataType::S16, DataType::S32, policy, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/CL/FillBorder.cpp b/tests/validation/CL/FillBorder.cpp
new file mode 100644
index 0000000000..42b9064982
--- /dev/null
+++ b/tests/validation/CL/FillBorder.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::cl;
+using namespace arm_compute::test::validation;
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(CL)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(FillBorder, BorderModes() * boost::unit_test::data::make({ PaddingSize{ 0 }, PaddingSize{ 1, 0, 1, 2 }, PaddingSize{ 10 } }), border_mode, padding)
+{
+ constexpr uint8_t border_value = 42U;
+ constexpr uint8_t tensor_value = 89U;
+ BorderSize border_size{ 5 };
+
+ // Create tensors
+ CLTensor src = create_tensor(TensorShape{ 10U, 10U, 2U }, DataType::U8);
+
+ src.info()->extend_padding(padding);
+
+ // Allocate tensor
+ src.allocator()->allocate();
+
+ // Check padding is as required
+ validate(src.info()->padding(), padding);
+
+ // Fill tensor with constant value
+ std::uniform_int_distribution<uint8_t> distribution{ tensor_value, tensor_value };
+ library->fill(CLAccessor(src), distribution, 0);
+
+ // Create and configure kernel
+ CLFillBorderKernel fill_border;
+ fill_border.configure(&src, border_size, border_mode, border_value);
+
+ // Run kernel
+ fill_border.run(fill_border.window(), CLScheduler::get().queue());
+
+ // Validate border
+ border_size.limit(padding);
+ validate(CLAccessor(src), border_size, border_mode, &border_value);
+
+ // Validate tensor
+ validate(CLAccessor(src), &tensor_value);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/CL/Threshold.cpp b/tests/validation/CL/Threshold.cpp
new file mode 100644
index 0000000000..a8c77ec10a
--- /dev/null
+++ b/tests/validation/CL/Threshold.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "dataset/ThresholdDataset.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLThreshold.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::cl;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute CL threshold function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+ * @param[in] false_value Value to set when the condition is not met.
+ * @param[in] true_value  Value to set when the condition is met.
+ * @param[in] type Thresholding type. Either RANGE or BINARY.
+ * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
+ *
+ * @return Computed output tensor.
+ */
+CLTensor compute_threshold(const TensorShape &shape, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ // Create tensors
+ CLTensor src = create_tensor(shape, DataType::U8);
+ CLTensor dst = create_tensor(shape, DataType::U8);
+
+ // Create and configure function
+ CLThreshold thrsh;
+ thrsh.configure(&src, &dst, threshold, false_value, true_value, type, upper);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(CLAccessor(src), 0);
+
+ // Compute function
+ thrsh.run();
+
+ return dst;
+}
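+
+// Illustrative semantics (an assumption based on the usual OpenVX-style
+// definition, not stated in this file): for ThresholdType::BINARY,
+// dst = src > threshold ? true_value : false_value; for ThresholdType::RANGE,
+// dst is false_value when src is above 'upper' or below 'threshold', and
+// true_value otherwise.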
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(CL)
+BOOST_AUTO_TEST_SUITE(Threshold)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration,
+ (SmallShapes() + LargeShapes()) * ThresholdDataset(),
+ shape, threshold_conf)
+{
+ // Create tensors
+ CLTensor src = create_tensor(shape, DataType::U8);
+ CLTensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ CLThreshold cl_threshold;
+ cl_threshold.configure(&src, &dst, threshold_conf.threshold, threshold_conf.false_value, threshold_conf.true_value, threshold_conf.type, threshold_conf.upper);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall,
+ SmallShapes() * ThresholdDataset(),
+ shape, threshold_conf)
+{
+ // Compute function
+ CLTensor dst = compute_threshold(shape, threshold_conf.threshold, threshold_conf.false_value, threshold_conf.true_value, threshold_conf.type, threshold_conf.upper);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_threshold(shape, threshold_conf.threshold, threshold_conf.false_value, threshold_conf.true_value, threshold_conf.type, threshold_conf.upper);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge,
+ LargeShapes() * ThresholdDataset(),
+ shape, threshold_conf)
+{
+ // Compute function
+ CLTensor dst = compute_threshold(shape, threshold_conf.threshold, threshold_conf.false_value, threshold_conf.true_value, threshold_conf.type, threshold_conf.upper);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_threshold(shape, threshold_conf.threshold, threshold_conf.false_value, threshold_conf.true_value, threshold_conf.type, threshold_conf.upper);
+
+ // Validate output
+ validate(CLAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
new file mode 100644
index 0000000000..3d8f56610b
--- /dev/null
+++ b/tests/validation/CMakeLists.txt
@@ -0,0 +1,96 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+add_library(openvx SHARED IMPORTED)
+set_target_properties(openvx PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../3rdparty/linux/armv7a/libopenvx.so"
+)
+
+add_library(vxu SHARED IMPORTED)
+set_target_properties(vxu PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../3rdparty/linux/armv7a/libvxu.so"
+)
+
+add_library(OpenCL SHARED IMPORTED)
+set_target_properties(OpenCL PROPERTIES
+ IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/../build/opencl-1.2-stubs/libOpenCL.so"
+ IMPORTED_NO_SONAME 1
+)
+
+add_definitions(-DBOOST)
+
+set(ARM_COMPUTE_TARGETS_TO_VALIDATE "all" CACHE STRING "Semicolon-separated list of targets to include in validation.")
+
+set(ARM_COMPUTE_ALL_TARGETS
+ NEON
+ CL
+ UNIT
+ VX
+)
+
+if(ARM_COMPUTE_TARGETS_TO_VALIDATE STREQUAL "all")
+ set(ARM_COMPUTE_TARGETS_TO_VALIDATE ${ARM_COMPUTE_ALL_TARGETS})
+endif()
+
+list(REMOVE_DUPLICATES ARM_COMPUTE_TARGETS_TO_VALIDATE)
+
+foreach(TARGET ${ARM_COMPUTE_TARGETS_TO_VALIDATE})
+ list(FIND ARM_COMPUTE_ALL_TARGETS ${TARGET} idx)
+
+ if(${idx} LESS 0)
+ message(FATAL_ERROR "The target '${TARGET}' does not exist. It should be one of\n${ARM_COMPUTE_ALL_TARGETS}")
+ else()
+ add_subdirectory(${TARGET})
+ endif()
+endforeach()
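+
+# Illustrative usage (hypothetical invocation): validation can be restricted to
+# a subset of backends, e.g.
+#   cmake -DARM_COMPUTE_TARGETS_TO_VALIDATE="NEON;CL" <path-to-tests/validation>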
+
+set(arm_compute_test_validation_SOURCE_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Datasets.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Reference.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Reference.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/ReferenceCPP.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/ReferenceCPP.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Validation.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/Validation.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/ValidationProgramOptions.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/ValidationUserConfiguration.h
+)
+
+add_library(arm_compute_test_validation OBJECT
+ ${arm_compute_test_validation_SOURCE_FILES}
+)
+
+add_executable(arm_compute_validation
+ $<TARGET_OBJECTS:arm_compute_test_validation>
+ ${arm_compute_test_validation_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:tensor_library>
+ $<TARGET_OBJECTS:arm_compute_test>
+)
+
+target_link_libraries(arm_compute_validation
+ boost_unit_test_framework
+ boost_program_options
+ arm_compute
+ ${arm_compute_test_validation_TARGET_LIBRARIES}
+)
diff --git a/tests/validation/Datasets.h b/tests/validation/Datasets.h
new file mode 100644
index 0000000000..ae76fb6be3
--- /dev/null
+++ b/tests/validation/Datasets.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_DATASETS_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_DATASETS_H__
+
+#include "dataset/ActivationFunctionDataset.h"
+#include "dataset/BatchNormalizationLayerDataset.h"
+#include "dataset/BorderModeDataset.h"
+#include "dataset/ConvertPolicyDataset.h"
+#include "dataset/ConvolutionLayerDataset.h"
+#include "dataset/DataTypeDatasets.h"
+#include "dataset/FullyConnectedLayerDataset.h"
+#include "dataset/GEMMDataset.h"
+#include "dataset/ImageDatasets.h"
+#include "dataset/InterpolationPolicyDataset.h"
+#include "dataset/NormalizationTypeDataset.h"
+#include "dataset/PoolingLayerDataset.h"
+#include "dataset/RoundingPolicyDataset.h"
+#include "dataset/ShapeDatasets.h"
+#include "dataset/ThresholdDataset.h"
+
+#include "boost_wrapper.h"
+
+using namespace boost::unit_test::data::monomorphic;
+
+namespace boost
+{
+namespace unit_test
+{
+namespace data
+{
+namespace monomorphic
+{
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::SmallImages> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::LargeImages> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::SmallShapes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::Small1DShape> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::LargeShapes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::AllDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::UnsignedDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::SignedDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::FloatDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::FixedPointDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::CNNFloatDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::CNNFixedPointDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::CNNDataTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::ActivationFunctions> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::BorderModes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::ConvertPolicies> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::InterpolationPolicies> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::NormalizationTypes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::RandomPoolingLayerDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::RoundingPolicies> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::AlexNetConvolutionLayerDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::AlexNetFullyConnectedLayerDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::DirectConvolutionShapes> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::SmallFullyConnectedLayerDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::LargeFullyConnectedLayerDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::SmallConvolutionLayerDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::SmallGEMMDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::LargeGEMMDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::RandomBatchNormalizationLayerDataset> : boost::mpl::true_
+{
+};
+
+/// Register the data set with Boost
+template <>
+struct is_dataset<arm_compute::test::ThresholdDataset> : boost::mpl::true_
+{
+};
+} // namespace monomorphic
+} // namespace data
+} // namespace unit_test
+} // namespace boost
+#endif
diff --git a/tests/validation/FixedPoint.h b/tests/validation/FixedPoint.h
new file mode 100644
index 0000000000..380bad04a1
--- /dev/null
+++ b/tests/validation/FixedPoint.h
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_FIXEDPOINT_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_FIXEDPOINT_H__
+
+#include "Utils.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <string>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace fixed_point_arithmetic
+{
+namespace detail
+{
+// Forward declare structs
+struct functions;
+template <typename T>
+struct constant_expr;
+} // namespace detail
+
+/** Fixed point traits */
+namespace traits
+{
+// Promote types
+// *INDENT-OFF*
+// clang-format off
+template <typename T> struct promote { };
+template <> struct promote<uint8_t> { using type = uint16_t; };
+template <> struct promote<int8_t> { using type = int16_t; };
+template <> struct promote<uint16_t> { using type = uint32_t; };
+template <> struct promote<int16_t> { using type = int32_t; };
+template <> struct promote<uint32_t> { using type = uint64_t; };
+template <> struct promote<int32_t> { using type = int64_t; };
+template <> struct promote<uint64_t> { using type = uint64_t; };
+template <> struct promote<int64_t> { using type = int64_t; };
+// clang-format on
+// *INDENT-ON*
+} // namespace traits
+
+/** Strongly typed enum class representing the overflow policy */
+enum class OverflowPolicy
+{
+ WRAP, /**< Wrap policy */
+ SATURATE /**< Saturate policy */
+};
+/** Strongly typed enum class representing the rounding policy */
+enum class RoundingPolicy
+{
+ TO_ZERO, /**< Round to zero policy */
+ TO_NEAREST_EVEN /**< Round to nearest even policy */
+};
+
+/** Arbitrary fixed-point arithmetic class */
+template <typename T>
+class fixed_point
+{
+public:
+ // Static Checks
+ static_assert(std::is_integral<T>::value, "Type is not an integer");
+
+ // Friends
+ friend struct detail::functions;
+ friend struct detail::constant_expr<T>;
+
+ /** Constructor (from different fixed point type)
+ *
+ * @param[in] val Fixed point value to convert
+ * @param[in] p Fixed point precision
+ */
+ template <typename U>
+ fixed_point(fixed_point<U> val, uint8_t p)
+ : _value(0), _fixed_point_position(p)
+ {
+ assert(p > 0 && p < std::numeric_limits<T>::digits);
+ T v = 0;
+
+ if(std::numeric_limits<T>::digits < std::numeric_limits<U>::digits)
+ {
+ val.rescale(p);
+ v = detail::constant_expr<T>::saturate_cast(val.raw());
+ }
+ else
+ {
+ auto v_cast = static_cast<fixed_point<T>>(val);
+ v_cast.rescale(p);
+ v = v_cast.raw();
+ }
+ _value = static_cast<T>(v);
+ }
+ /** Constructor (from integer)
+ *
+ * @param[in] val Integer value to be represented as fixed point
+ * @param[in] p Fixed point precision
+ * @param[in] is_raw If true, val is a raw fixed point value, otherwise an integer
+ */
+ template <typename U, typename = typename std::enable_if<std::is_integral<U>::value>::type>
+ fixed_point(U val, uint8_t p, bool is_raw = false)
+ : _value(val << p), _fixed_point_position(p)
+ {
+ if(is_raw)
+ {
+ _value = val;
+ }
+ }
+ /** Constructor (from float)
+ *
+ * @param[in] val Float value to be represented as fixed point
+ * @param[in] p Fixed point precision
+ */
+ fixed_point(float val, uint8_t p)
+ : _value(detail::constant_expr<T>::to_fixed(val, p)), _fixed_point_position(p)
+ {
+ assert(p > 0 && p < std::numeric_limits<T>::digits);
+ }
+ /** Constructor (from float string)
+ *
+ * @param[in] str Float string to be represented as fixed point
+ * @param[in] p Fixed point precision
+ */
+ fixed_point(std::string str, uint8_t p)
+ : _value(detail::constant_expr<T>::to_fixed(arm_compute::test::cpp11::stof(str), p)), _fixed_point_position(p)
+ {
+ assert(p > 0 && p < std::numeric_limits<T>::digits);
+ }
+ /** Default copy assignment operator */
+ fixed_point &operator=(const fixed_point &) = default;
+ /** Default move assignment operator */
+ fixed_point &operator=(fixed_point &&) = default;
+ /** Default copy constructor */
+ fixed_point(const fixed_point &) = default;
+ /** Default move constructor */
+ fixed_point(fixed_point &&) = default;
+
+ /** Float conversion operator
+ *
+ * @return Float representation of fixed point
+ */
+ operator float() const
+ {
+ return detail::constant_expr<T>::to_float(_value, _fixed_point_position);
+ }
+ /** Integer conversion operator
+ *
+ * @return Integer representation of fixed point
+ */
+ template <typename U, typename = typename std::enable_if<std::is_integral<U>::value>::type>
+ operator U() const
+ {
+ return detail::constant_expr<T>::to_int(_value, _fixed_point_position);
+ }
+ /** Convert to a fixed point of a different type but the same precision
+ *
+ * @note Down-conversion saturates values that do not fit the target type.
+ */
+ template <typename U>
+ operator fixed_point<U>()
+ {
+ U val = static_cast<U>(_value);
+ if(std::numeric_limits<U>::digits < std::numeric_limits<T>::digits)
+ {
+ val = detail::constant_expr<U>::saturate_cast(_value);
+ }
+ return fixed_point<U>(val, _fixed_point_position, true);
+ }
+
+ /** Arithmetic += assignment operator
+ *
+ * @param[in] rhs Fixed point operand
+ *
+ * @return Reference to this fixed point
+ */
+ template <typename U>
+ fixed_point<T> &operator+=(const fixed_point<U> &rhs)
+ {
+ fixed_point<T> val(rhs, _fixed_point_position);
+ _value += val.raw();
+ return *this;
+ }
+ /** Arithmetic -= assignment operator
+ *
+ * @param[in] rhs Fixed point operand
+ *
+ * @return Reference to this fixed point
+ */
+ template <typename U>
+ fixed_point<T> &operator-=(const fixed_point<U> &rhs)
+ {
+ fixed_point<T> val(rhs, _fixed_point_position);
+ _value -= val.raw();
+ return *this;
+ }
+
+ /** Raw value accessor
+ *
+ * @return Raw fixed point value
+ */
+ T raw() const
+ {
+ return _value;
+ }
+ /** Precision accessor
+ *
+ * @return Precision of fixed point
+ */
+ uint8_t precision() const
+ {
+ return _fixed_point_position;
+ }
+ /** Rescale a fixed point to a new precision
+ *
+ * @param[in] p New fixed point precision
+ */
+ void rescale(uint8_t p)
+ {
+ assert(p > 0 && p < std::numeric_limits<T>::digits);
+
+ if(p > _fixed_point_position)
+ {
+ _value <<= (p - _fixed_point_position);
+ }
+ else if(p < _fixed_point_position)
+ {
+ _value >>= (_fixed_point_position - p);
+ }
+
+ _fixed_point_position = p;
+ }
+
+private:
+ T _value; /**< Fixed point raw value */
+ uint8_t _fixed_point_position; /**< Fixed point precision */
+};
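+
+/* A minimal usage sketch of the class above (illustrative only):
+ *
+ * @code
+ * using arm_compute::test::fixed_point_arithmetic::fixed_point;
+ *
+ * fixed_point<int16_t> a(1.5f, 12); // 1.5 in Q3.12 -> raw value 6144
+ * fixed_point<int16_t> b(2, 12);    // integer 2    -> raw value 8192
+ * float f = static_cast<float>(a);  // f == 1.5f
+ * a += b;                           // a now represents 3.5
+ * @endcode
+ */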
+
+namespace detail
+{
+/** Count the number of leading zero bits in the given value.
+ *
+ * @param[in] value Input value.
+ *
+ * @return Number of leading zero bits.
+ */
+template <typename T>
+constexpr int clz(T value)
+{
+ using unsigned_T = typename std::make_unsigned<T>::type;
+ // __builtin_clz is available for int. Need to correct reported number to
+ // match the original type.
+ return __builtin_clz(value) - (32 - std::numeric_limits<unsigned_T>::digits);
+}
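+
+// Worked example (illustrative): for T = uint8_t and value = 1,
+// __builtin_clz(1) == 31 on a 32-bit int and digits == 8, so clz returns
+// 31 - (32 - 8) = 7, i.e. seven leading zero bits in 0b00000001.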
+
+template <typename T>
+struct constant_expr
+{
+ /** Calculate representation of 1 in fixed point given a fixed point precision
+ *
+ * @param[in] p Fixed point precision
+ *
+ * @return Representation of value 1 in fixed point.
+ */
+ static constexpr T fixed_one(uint8_t p)
+ {
+ return (1 << p);
+ }
+ /** Calculate fixed point precision step given a fixed point precision
+ *
+ * @param[in] p Fixed point precision
+ *
+ * @return Fixed point precision step
+ */
+ static constexpr float fixed_step(uint8_t p)
+ {
+ return (1.0f / static_cast<float>(1 << p));
+ }
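+ // Worked values (illustrative): for p = 8, fixed_one(8) == 256 and
+ // fixed_step(8) == 1.0f / 256 == 0.00390625f, the smallest representable
+ // increment in Q.8.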
+
+ /** Convert a fixed point value to float given its precision.
+ *
+ * @param[in] val Fixed point value
+ * @param[in] p Fixed point precision
+ *
+ * @return Float representation of the fixed point number
+ */
+ static constexpr float to_float(T val, uint8_t p)
+ {
+ return static_cast<float>(val * fixed_step(p));
+ }
+ /** Convert a fixed point value to integer given its precision.
+ *
+ * @param[in] val Fixed point value
+ * @param[in] p Fixed point precision
+ *
+ * @return Integer of the fixed point number
+ */
+ static constexpr T to_int(T val, uint8_t p)
+ {
+ return val >> p;
+ }
+ /** Convert a single precision floating point value to a fixed point representation given its precision.
+ *
+ * @param[in] val Floating point value
+ * @param[in] p Fixed point precision
+ *
+ * @return The raw fixed point representation
+ */
+ static constexpr T to_fixed(float val, uint8_t p)
+ {
+ return static_cast<T>(val * fixed_one(p) + ((val >= 0) ? 0.5 : -0.5));
+ }
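+ // Worked example (illustrative): to_fixed(1.5f, 4) computes
+ // 1.5 * 16 + 0.5 = 24.5 and truncates to 24, i.e. exactly 1.5 in Q.4;
+ // negative inputs are biased by -0.5 so they also round away from zero.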
+ /** Clamp value to the given range
+ *
+ * @param[in] val Value to clamp
+ * @param[in] min Minimum value to clamp to
+ * @param[in] max Maximum value to clamp to
+ *
+ * @return Clamped value
+ */
+ static constexpr T clamp(T val, T min, T max)
+ {
+ return std::min(std::max(val, min), max);
+ }
+ /** Saturate given number
+ *
+ * @param[in] val Value to saturate
+ *
+ * @return Saturated value
+ */
+ template <typename U>
+ static constexpr T saturate_cast(U val)
+ {
+ return static_cast<T>(std::min<U>(std::max<U>(val, static_cast<U>(std::numeric_limits<T>::min())), static_cast<U>(std::numeric_limits<T>::max())));
+ }
+};
+struct functions
+{
+ /** Output stream operator
+ *
+ * @param[in] s Output stream
+ * @param[in] x Fixed point value
+ *
+ * @return Reference to the updated output stream
+ */
+ template <typename T, typename U, typename traits>
+ static std::basic_ostream<T, traits> &write(std::basic_ostream<T, traits> &s, fixed_point<U> &x)
+ {
+ return s << static_cast<float>(x);
+ }
+ /** Signbit of a fixed point number.
+ *
+ * @param[in] x Fixed point number
+ *
+ * @return True if negative else false.
+ */
+ template <typename T>
+ static bool signbit(fixed_point<T> x)
+ {
+ return ((x._value >> std::numeric_limits<T>::digits) != 0);
+ }
+ /** Checks if two fixed point numbers are equal
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return True if fixed points are equal else false
+ */
+ template <typename T>
+ static bool isequal(fixed_point<T> x, fixed_point<T> y)
+ {
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ x.rescale(p);
+ y.rescale(p);
+ return (x._value == y._value);
+ }
+ /** Checks if two fixed point numbers are not equal
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return True if fixed points are not equal else false
+ */
+ template <typename T>
+ static bool isnotequal(fixed_point<T> x, fixed_point<T> y)
+ {
+ return !isequal(x, y);
+ }
+ /** Checks if one fixed point is greater than the other
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return True if the first fixed point is greater than the second
+ */
+ template <typename T>
+ static bool isgreater(fixed_point<T> x, fixed_point<T> y)
+ {
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ x.rescale(p);
+ y.rescale(p);
+ return (x._value > y._value);
+ }
+ /** Checks if one fixed point is greater than or equal to the other
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return True if the first fixed point is greater than or equal to the second
+ */
+ template <typename T>
+ static bool isgreaterequal(fixed_point<T> x, fixed_point<T> y)
+ {
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ x.rescale(p);
+ y.rescale(p);
+ return (x._value >= y._value);
+ }
+ /** Checks if one fixed point is less than the other
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return True if the first fixed point is less than the second
+ */
+ template <typename T>
+ static bool isless(fixed_point<T> x, fixed_point<T> y)
+ {
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ x.rescale(p);
+ y.rescale(p);
+ return (x._value < y._value);
+ }
+ /** Checks if one fixed point is less than or equal to the other
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return True if the first fixed point is less than or equal to the second
+ */
+ template <typename T>
+ static bool islessequal(fixed_point<T> x, fixed_point<T> y)
+ {
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ x.rescale(p);
+ y.rescale(p);
+ return (x._value <= y._value);
+ }
+ /** Checks if one fixed point is less than or greater than the other, i.e. not equal
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return True if the fixed points are not equal
+ */
+ template <typename T>
+ static bool islessgreater(fixed_point<T> x, fixed_point<T> y)
+ {
+ return isnotequal(x, y);
+ }
+ /** Clamp fixed point to specific range.
+ *
+ * @param[in] x Fixed point operand
+ * @param[in] min Minimum value to clamp to
+ * @param[in] max Maximum value to clamp to
+ *
+ * @return Clamped result
+ */
+ template <typename T>
+ static fixed_point<T> clamp(fixed_point<T> x, T min, T max)
+ {
+ return fixed_point<T>(constant_expr<T>::clamp(x._value, min, max), x._fixed_point_position, true);
+ }
+ /** Negate number
+ *
+ * @param[in] x Fixed point operand
+ *
+ * @return Negated fixed point result
+ */
+ template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+ static fixed_point<T> negate(fixed_point<T> x)
+ {
+ using promoted_T = typename traits::promote<T>::type;
+ promoted_T val = -x._value;
+ if(OP == OverflowPolicy::SATURATE)
+ {
+ val = constant_expr<T>::saturate_cast(val);
+ }
+ return fixed_point<T>(static_cast<T>(val), x._fixed_point_position, true);
+ }
+ /** Perform addition of two fixed point numbers
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return Resulting fixed point with precision equal to the minimum precision of the two operands
+ */
+ template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+ static fixed_point<T> add(fixed_point<T> x, fixed_point<T> y)
+ {
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ x.rescale(p);
+ y.rescale(p);
+ if(OP == OverflowPolicy::SATURATE)
+ {
+ using type = typename traits::promote<T>::type;
+ type val = static_cast<type>(x._value) + static_cast<type>(y._value);
+ val = constant_expr<T>::saturate_cast(val);
+ return fixed_point<T>(static_cast<T>(val), p, true);
+ }
+ else
+ {
+ return fixed_point<T>(x._value + y._value, p, true);
+ }
+ }
+ /** Perform subtraction of two fixed point numbers
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return Resulting fixed point with precision equal to the minimum precision of the two operands
+ */
+ template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+ static fixed_point<T> sub(fixed_point<T> x, fixed_point<T> y)
+ {
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ x.rescale(p);
+ y.rescale(p);
+ if(OP == OverflowPolicy::SATURATE)
+ {
+ using type = typename traits::promote<T>::type;
+ type val = static_cast<type>(x._value) - static_cast<type>(y._value);
+ val = constant_expr<T>::saturate_cast(val);
+ return fixed_point<T>(static_cast<T>(val), p, true);
+ }
+ else
+ {
+ return fixed_point<T>(x._value - y._value, p, true);
+ }
+ }
+ /** Perform multiplication of two fixed point numbers
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return Resulting fixed point with precision equal to the minimum precision of the two operands
+ */
+ template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+ static fixed_point<T> mul(fixed_point<T> x, fixed_point<T> y)
+ {
+ using promoted_T = typename traits::promote<T>::type;
+ uint8_t p_min = std::min(x._fixed_point_position, y._fixed_point_position);
+ uint8_t p_max = std::max(x._fixed_point_position, y._fixed_point_position);
+ promoted_T round_factor = (1 << (p_max - 1));
+ promoted_T val = ((static_cast<promoted_T>(x._value) * static_cast<promoted_T>(y._value)) + round_factor) >> p_max;
+ if(OP == OverflowPolicy::SATURATE)
+ {
+ val = constant_expr<T>::saturate_cast(val);
+ }
+ return fixed_point<T>(static_cast<T>(val), p_min, true);
+ }
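+ // Worked example (illustrative): with 2 fractional bits, 1.5 is stored as raw 6;
+ // mul then computes ((6 * 6) + 2) >> 2 = 9, i.e. 2.25 at 2 fractional bits, the exact product.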
+ /** Perform division of two fixed point numbers
+ *
+ * @param[in] x First fixed point operand
+ * @param[in] y Second fixed point operand
+ *
+ * @return Resulting fixed point with precision equal to the minimum precision of the two operands
+ */
+ template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+ static fixed_point<T> div(fixed_point<T> x, fixed_point<T> y)
+ {
+ using promoted_T = typename traits::promote<T>::type;
+ uint8_t p = std::min(x._fixed_point_position, y._fixed_point_position);
+ promoted_T denom = static_cast<promoted_T>(y._value);
+ if(denom != 0)
+ {
+ promoted_T val = (static_cast<promoted_T>(x._value) << std::max(x._fixed_point_position, y._fixed_point_position)) / denom;
+ if(OP == OverflowPolicy::SATURATE)
+ {
+ val = constant_expr<T>::saturate_cast(val);
+ }
+ return fixed_point<T>(static_cast<T>(val), p, true);
+ }
+ else
+ {
+ T val = (x._value < 0) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
+ return fixed_point<T>(val, p, true);
+ }
+ }
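+ // Worked example (illustrative): with 2 fractional bits, 3.0 is raw 12 and 1.5 is raw 6;
+ // div computes (12 << 2) / 6 = 8, i.e. 2.0 at 2 fractional bits.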
+ /** Shift left
+ *
+ * @param[in] x Fixed point operand
+ * @param[in] shift Shift value
+ *
+ * @return Shifted value
+ */
+ template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+ static fixed_point<T> shift_left(fixed_point<T> x, size_t shift)
+ {
+ using promoted_T = typename traits::promote<T>::type;
+ promoted_T val = static_cast<promoted_T>(x._value) << shift;
+ if(OP == OverflowPolicy::SATURATE)
+ {
+ val = constant_expr<T>::saturate_cast(val);
+ }
+ return fixed_point<T>(static_cast<T>(val), x._fixed_point_position, true);
+ }
+ /** Shift right
+ *
+ * @param[in] x Fixed point operand
+ * @param[in] shift Shift value
+ *
+ * @return Shifted value
+ */
+ template <typename T>
+ static fixed_point<T> shift_right(fixed_point<T> x, size_t shift)
+ {
+ return fixed_point<T>(x._value >> shift, x._fixed_point_position, true);
+ }
+ /** Calculate absolute value
+ *
+ * @param[in] x Fixed point operand
+ *
+ * @return Absolute value of operand
+ */
+ template <typename T>
+ static fixed_point<T> abs(fixed_point<T> x)
+ {
+ using promoted_T = typename traits::promote<T>::type;
+ T val = (x._value < 0) ? constant_expr<T>::saturate_cast(-static_cast<promoted_T>(x._value)) : x._value;
+ return fixed_point<T>(val, x._fixed_point_position, true);
+ }
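+ // Note: the promotion to promoted_T above matters for x == numeric_limits<T>::min(),
+ // whose magnitude does not fit in T and therefore saturates to numeric_limits<T>::max().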
+ /** Calculate the natural logarithm of a fixed point number
+ *
+ * @param[in] x Fixed point operand
+ *
+ * @return Logarithm value of operand
+ */
+ template <typename T>
+ static fixed_point<T> log(fixed_point<T> x)
+ {
+ uint8_t p = x._fixed_point_position;
+ auto const_one = fixed_point<T>(static_cast<T>(1), p);
+
+ // The logarithm of 1 is zero and the logarithm of non-positive values is not defined over the reals, so return 0.
+ // Also, log(x) == -log(1/x) for 0 < x < 1.
+ if(isequal(x, const_one) || islessequal(x, fixed_point<T>(static_cast<T>(0), p)))
+ {
+ return fixed_point<T>(static_cast<T>(0), p, true);
+ }
+ else if(isless(x, const_one))
+ {
+ return mul(log(div(const_one, x)), fixed_point<T>(-1, p));
+ }
+
+ // Range reduction: express x as m * 2^shift_val with m in [1, 2)
+ T shift_val = 31 - __builtin_clz(x._value >> p);
+ x = shift_right(x, shift_val);
+ x = sub(x, const_one);
+
+ // Constants
+ auto ln2 = fixed_point<T>(0.6931471, p);
+ auto A = fixed_point<T>(1.4384189, p);
+ auto B = fixed_point<T>(-0.67719, p);
+ auto C = fixed_point<T>(0.3218538, p);
+ auto D = fixed_point<T>(-0.0832229, p);
+
+ // Polynomial approximation of log2(1 + x) for x in [0, 1)
+ auto sum = add(mul(x, D), C);
+ sum = add(mul(x, sum), B);
+ sum = add(mul(x, sum), A);
+ sum = mul(x, sum);
+
+ return mul(add(sum, fixed_point<T>(static_cast<T>(shift_val), p)), ln2);
+ }
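+ // Worked example (illustrative): for x = 6.0 the range reduction gives shift_val = 2
+ // and m = 1.5, so the result is (log2(1.5) + 2) * ln(2) = 2.585 * 0.693 ≈ 1.792 = ln(6),
+ // up to the fixed point precision.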
+ /** Calculate the exponential of a fixed point number.
+ *
+ * Let k = int(x / ln(2)). Then:
+ * exp(x) = exp(k * ln(2)) * exp(x - k * ln(2))
+ * = pow(2, k) * exp(x - k * ln(2))
+ * = exp(x - k * ln(2)) << k
+ *
+ * @param[in] x Fixed point operand
+ *
+ * @return Exponential value of operand
+ */
+ template <typename T>
+ static fixed_point<T> exp(fixed_point<T> x)
+ {
+ uint8_t p = x._fixed_point_position;
+ // Constants
+ auto const_one = fixed_point<T>(1, p);
+ auto ln2 = fixed_point<T>(0.6931471, p);
+ auto inv_ln2 = fixed_point<T>(1.442695, p);
+ auto A = fixed_point<T>(0.9978546, p);
+ auto B = fixed_point<T>(0.4994721, p);
+ auto C = fixed_point<T>(0.1763723, p);
+ auto D = fixed_point<T>(0.0435108, p);
+
+ T scaled_int_part = detail::constant_expr<T>::to_int(mul(x, inv_ln2)._value, p);
+
+ // Polynomial approximation of exp(r) for the fractional remainder r
+ auto frac_part = sub(x, mul(ln2, fixed_point<T>(scaled_int_part, p)));
+ auto taylor = add(mul(frac_part, D), C);
+ taylor = add(mul(frac_part, taylor), B);
+ taylor = add(mul(frac_part, taylor), A);
+ taylor = mul(frac_part, taylor);
+ taylor = add(taylor, const_one);
+
+ // Saturate if the left shift below would overflow
+ if(static_cast<T>(clz(taylor.raw())) <= scaled_int_part)
+ {
+ return fixed_point<T>(std::numeric_limits<T>::max(), p, true);
+ }
+
+ return (scaled_int_part < 0) ? shift_right(taylor, -scaled_int_part) : shift_left(taylor, scaled_int_part);
+ }
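+ // Worked example (illustrative): for x = 3.5, k = int(3.5 / ln(2)) = 5 and
+ // r = 3.5 - 5 * ln(2) ≈ 0.034, so exp(3.5) ≈ exp(0.034) << 5 ≈ 1.035 * 32 ≈ 33.1.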
+ /** Calculate the inverse square root of a fixed point number
+ *
+ * @param[in] x Fixed point operand
+ *
+ * @return Inverse square root value of operand
+ */
+ template <typename T>
+ static fixed_point<T> inv_sqrt(fixed_point<T> x)
+ {
+ const uint8_t p = x._fixed_point_position;
+ int8_t shift = std::numeric_limits<T>::digits - (p + detail::clz(x._value));
+
+ shift += std::numeric_limits<T>::is_signed ? 1 : 0;
+
+ const auto three_half = fixed_point<T>(1.5f, p);
+ fixed_point<T> a = shift < 0 ? shift_left(x, -shift) : shift_right(x, shift);
+ const fixed_point<T> x_half = shift_right(a, 1);
+
+ // Three Newton-Raphson iterations, a <- a * (3/2 - (x/2) * a^2) with x the normalized input, are used to converge to 1/sqrt(x)
+ for(int i = 0; i < 3; ++i)
+ {
+ a = mul(a, sub(three_half, mul(x_half, mul(a, a))));
+ }
+
+ return (shift < 0) ? shift_left(a, -shift >> 1) : shift_right(a, shift >> 1);
+ }
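+ // The final half-shift undoes the range reduction above: a approximates
+ // 1 / sqrt(x * 2^-shift) = 2^(shift / 2) / sqrt(x), so shifting a back by
+ // shift / 2 in the opposite direction yields 1 / sqrt(x).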
+ /** Calculate the hyperbolic tangent of a fixed point number
+ *
+ * Computed as tanh(x) = (exp(2x) - 1) / (exp(2x) + 1).
+ *
+ * @param[in] x Fixed point operand
+ *
+ * @return Hyperbolic tangent of the operand
+ */
+ template <typename T>
+ static fixed_point<T> tanh(fixed_point<T> x)
+ {
+ uint8_t p = x._fixed_point_position;
+ // Constants
+ auto const_one = fixed_point<T>(1, p);
+ auto const_two = fixed_point<T>(2, p);
+
+ auto exp2x = exp(const_two * x);
+ auto num = exp2x - const_one;
+ auto den = exp2x + const_one;
+ auto tanh = num / den;
+
+ return tanh;
+ }
+ /** Calculate the a-th power of a fixed point number.
+ *
+ * The power is computed as x^a = e^(log(x) * a)
+ *
+ * @param[in] x Fixed point operand
+ * @param[in] a Fixed point exponent
+ *
+ * @return a-th power of the operand
+ */
+ template <typename T>
+ static fixed_point<T> pow(fixed_point<T> x, fixed_point<T> a)
+ {
+ return exp(log(x) * a);
+ }
+};
+
+template <typename T>
+bool operator==(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return functions::isequal(lhs, rhs);
+}
+template <typename T>
+bool operator!=(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return !operator==(lhs, rhs);
+}
+template <typename T>
+bool operator<(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return functions::isless(lhs, rhs);
+}
+template <typename T>
+bool operator>(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return operator<(rhs, lhs);
+}
+template <typename T>
+bool operator<=(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return !operator>(lhs, rhs);
+}
+template <typename T>
+bool operator>=(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return !operator<(lhs, rhs);
+}
+template <typename T>
+fixed_point<T> operator+(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return functions::add(lhs, rhs);
+}
+template <typename T>
+fixed_point<T> operator-(const fixed_point<T> &lhs, const fixed_point<T> &rhs)
+{
+ return functions::sub(lhs, rhs);
+}
+template <typename T>
+fixed_point<T> operator-(const fixed_point<T> &rhs)
+{
+ return functions::negate(rhs);
+}
+template <typename T>
+fixed_point<T> operator*(fixed_point<T> x, fixed_point<T> y)
+{
+ return functions::mul(x, y);
+}
+template <typename T>
+fixed_point<T> operator/(fixed_point<T> x, fixed_point<T> y)
+{
+ return functions::div(x, y);
+}
+template <typename T>
+fixed_point<T> operator>>(fixed_point<T> x, size_t shift)
+{
+ return functions::shift_right(x, shift);
+}
+template <typename T>
+fixed_point<T> operator<<(fixed_point<T> x, size_t shift)
+{
+ return functions::shift_left(x, shift);
+}
+template <typename T, typename U, typename traits>
+std::basic_ostream<T, traits> &operator<<(std::basic_ostream<T, traits> &s, fixed_point<U> x)
+{
+ return functions::write(s, x);
+}
+template <typename T>
+inline fixed_point<T> min(fixed_point<T> x, fixed_point<T> y)
+{
+ return x > y ? y : x;
+}
+template <typename T>
+inline fixed_point<T> max(fixed_point<T> x, fixed_point<T> y)
+{
+ return x > y ? x : y;
+}
+template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+inline fixed_point<T> add(fixed_point<T> x, fixed_point<T> y)
+{
+ return functions::add<OP>(x, y);
+}
+template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+inline fixed_point<T> sub(fixed_point<T> x, fixed_point<T> y)
+{
+ return functions::sub<OP>(x, y);
+}
+template <OverflowPolicy OP = OverflowPolicy::SATURATE, typename T>
+inline fixed_point<T> mul(fixed_point<T> x, fixed_point<T> y)
+{
+ return functions::mul<OP>(x, y);
+}
+template <typename T>
+inline fixed_point<T> div(fixed_point<T> x, fixed_point<T> y)
+{
+ return functions::div(x, y);
+}
+template <typename T>
+inline fixed_point<T> abs(fixed_point<T> x)
+{
+ return functions::abs(x);
+}
+template <typename T>
+inline fixed_point<T> clamp(fixed_point<T> x, T min, T max)
+{
+ return functions::clamp(x, min, max);
+}
+template <typename T>
+inline fixed_point<T> exp(fixed_point<T> x)
+{
+ return functions::exp(x);
+}
+template <typename T>
+inline fixed_point<T> log(fixed_point<T> x)
+{
+ return functions::log(x);
+}
+template <typename T>
+inline fixed_point<T> inv_sqrt(fixed_point<T> x)
+{
+ return functions::inv_sqrt(x);
+}
+template <typename T>
+inline fixed_point<T> tanh(fixed_point<T> x)
+{
+ return functions::tanh(x);
+}
+template <typename T>
+inline fixed_point<T> pow(fixed_point<T> x, fixed_point<T> a)
+{
+ return functions::pow(x, a);
+}
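+// Usage sketch (illustrative only, relying on the float constructor used above):
+//   fixed_point<int8_t> a(1.5f, 2); // raw value 6 with 2 fractional bits
+//   fixed_point<int8_t> b(0.5f, 2); // raw value 2
+//   fixed_point<int8_t> c = a * b;  // 0.75, computed by functions::mul
+//   fixed_point<int8_t> d = add(a, b); // 2.0, saturating by default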
+} // namespace detail
+
+// Expose operators
+using detail::operator==;
+using detail::operator!=;
+using detail::operator<;
+using detail::operator>;
+using detail::operator<=;
+using detail::operator>=;
+using detail::operator+;
+using detail::operator-;
+using detail::operator*;
+using detail::operator/;
+using detail::operator>>;
+using detail::operator<<;
+
+// Expose additional functions
+using detail::min;
+using detail::max;
+using detail::add;
+using detail::sub;
+using detail::mul;
+using detail::div;
+using detail::abs;
+using detail::clamp;
+using detail::exp;
+using detail::log;
+using detail::inv_sqrt;
+using detail::tanh;
+using detail::pow;
+// TODO: floor
+// TODO: ceil
+// TODO: sqrt
+} // namespace fixed_point_arithmetic
+} // namespace test
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_TEST_VALIDATION_FIXEDPOINT_H__ */
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
new file mode 100644
index 0000000000..cbaea4b894
--- /dev/null
+++ b/tests/validation/Helpers.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_HELPERS_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_HELPERS_H__
+
+#include "Types.h"
+
+#include <type_traits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Helper function to get the testing range for each activation layer.
+ *
+ * @param[in] activation Activation function to test.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part. Defaults to 1.
+ *
+ * @return A pair containing the lower and upper testing bounds for a given function.
+ */
+template <typename T>
+std::pair<T, T> get_activation_layer_test_bounds(ActivationLayerInfo::ActivationFunction activation, int fixed_point_position = 1)
+{
+ bool is_float = std::is_floating_point<T>::value;
+ std::pair<T, T> bounds;
+
+ // Set initial values
+ if(is_float)
+ {
+ bounds = std::make_pair(-255.f, 255.f);
+ }
+ else
+ {
+ bounds = std::make_pair(std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max());
+ }
+
+ // Reduce testing ranges
+ switch(activation)
+ {
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ // Reduce range as exponent overflows
+ if(is_float)
+ {
+ bounds.first = -40.f;
+ bounds.second = 40.f;
+ }
+ else
+ {
+ bounds.first = -(1 << (fixed_point_position));
+ bounds.second = 1 << (fixed_point_position);
+ }
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ // Reduce range as exponent overflows
+ if(!is_float)
+ {
+ bounds.first = -(1 << (fixed_point_position));
+ bounds.second = 1 << (fixed_point_position);
+ }
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ // Reduce range as sqrt should take a non-negative number
+ bounds.first = (is_float) ? 0 : 1 << (fixed_point_position);
+ break;
+ default:
+ break;
+ }
+ return bounds;
+}
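+// For example (illustrative): get_activation_layer_test_bounds<int8_t>(
+// ActivationLayerInfo::ActivationFunction::TANH, 3) returns the pair
+// (-(1 << 3), 1 << 3), i.e. the interval [-8, 8].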
+
+/** Helper function to get the testing range for batch normalization layer.
+ *
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part. Defaults to 1.
+ *
+ * @return A pair containing the lower and upper testing bounds.
+ */
+template <typename T>
+std::pair<T, T> get_batchnormalization_layer_test_bounds(int fixed_point_position = 1)
+{
+ bool is_float = std::is_floating_point<T>::value;
+ std::pair<T, T> bounds;
+
+ // Set initial values
+ if(is_float)
+ {
+ bounds = std::make_pair(-1.f, 1.f);
+ }
+ else
+ {
+ bounds = std::make_pair(1, 1 << (fixed_point_position));
+ }
+
+ return bounds;
+}
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif //__ARM_COMPUTE_TEST_VALIDATION_HELPERS_H__
diff --git a/tests/validation/NEON/AbsoluteDifference.cpp b/tests/validation/NEON/AbsoluteDifference.cpp
new file mode 100644
index 0000000000..b7f45d2384
--- /dev/null
+++ b/tests/validation/NEON/AbsoluteDifference.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon absolute difference function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_absolute_difference(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt_in0);
+ Tensor src2 = create_tensor(shape, dt_in1);
+ Tensor dst = create_tensor(shape, dt_out);
+
+ // Create and configure function
+ NEAbsoluteDifference abs_d;
+ abs_d.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+
+ // Compute function
+ abs_d.run();
+
+ return dst;
+}
+
+void validate_configuration(const Tensor &src1, const Tensor &src2, Tensor &dst, TensorShape shape)
+{
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEAbsoluteDifference abs_d;
+ abs_d.configure(&src1, &src2, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(AbsoluteDifference)
+
+BOOST_AUTO_TEST_SUITE(U8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()),
+ shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ validate_configuration(src1, src2, dst, shape);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(),
+ shape)
+{
+ // Compute function
+ Tensor dst = compute_absolute_difference(shape, DataType::U8, DataType::U8, DataType::U8);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_absolute_difference(shape, DataType::U8, DataType::U8, DataType::U8);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(),
+ shape)
+{
+ // Compute function
+ Tensor dst = compute_absolute_difference(shape, DataType::U8, DataType::U8, DataType::U8);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_absolute_difference(shape, DataType::U8, DataType::U8, DataType::U8);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(S16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ DataType::U8, DataType::S16 }),
+ shape, dt)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt);
+ Tensor src2 = create_tensor(shape, DataType::S16);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ validate_configuration(src1, src2, dst, shape);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }),
+ shape, dt)
+{
+ // Compute function
+ Tensor dst = compute_absolute_difference(shape, dt, DataType::S16, DataType::S16);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_absolute_difference(shape, dt, DataType::S16, DataType::S16);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }),
+ shape, dt)
+{
+ // Compute function
+ Tensor dst = compute_absolute_difference(shape, dt, DataType::S16, DataType::S16);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_absolute_difference(shape, dt, DataType::S16, DataType::S16);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Accumulate.cpp b/tests/validation/NEON/Accumulate.cpp
new file mode 100644
index 0000000000..e3ea37cd99
--- /dev/null
+++ b/tests/validation/NEON/Accumulate.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon accumulate function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_accumulate(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ // Create and configure function
+ NEAccumulate acc;
+ acc.configure(&src, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+ library->fill_tensor_uniform(NEAccessor(dst), 1);
+
+ // Compute function
+ acc.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(Accumulate)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()),
+ shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEAccumulate acc;
+ acc.configure(&src, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(),
+ shape)
+{
+ // Compute function
+ Tensor dst = compute_accumulate(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_accumulate(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(),
+ shape)
+{
+ // Compute function
+ Tensor dst = compute_accumulate(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_accumulate(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/AccumulateSquared.cpp b/tests/validation/NEON/AccumulateSquared.cpp
new file mode 100644
index 0000000000..10263a02e3
--- /dev/null
+++ b/tests/validation/NEON/AccumulateSquared.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon accumulate squared function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] shift Shift value used by the accumulate squared operation.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_accumulate_squared(const TensorShape &shape, uint32_t shift)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ // Create and configure function
+ NEAccumulateSquared acc;
+ acc.configure(&src, shift, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ // dst tensor filled with non-negative values
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+ library->fill_tensor_uniform(NEAccessor(dst), 1, static_cast<int16_t>(0), std::numeric_limits<int16_t>::max());
+
+ // Compute function
+ acc.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(AccumulateSquared)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::xrange(0U, 16U),
+ shape, shift)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEAccumulateSquared acc;
+ acc.configure(&src, shift, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::xrange(0U, 16U),
+ shape, shift)
+{
+ // Compute function
+ Tensor dst = compute_accumulate_squared(shape, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_accumulate_squared(shape, shift);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ 0U, 1U, 15U }),
+ shape, shift)
+{
+ // Compute function
+ Tensor dst = compute_accumulate_squared(shape, shift);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_accumulate_squared(shape, shift);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/AccumulateWeighted.cpp b/tests/validation/NEON/AccumulateWeighted.cpp
new file mode 100644
index 0000000000..6d45848647
--- /dev/null
+++ b/tests/validation/NEON/AccumulateWeighted.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon accumulate weighted function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] alpha Accumulation weight.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_accumulate_weighted(const TensorShape &shape, float alpha)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ // Create and configure function
+ NEAccumulateWeighted acc;
+ acc.configure(&src, alpha, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+ library->fill_tensor_uniform(NEAccessor(dst), 1);
+
+ // Compute function
+ acc.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(AccumulateWeighted)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ 0.f, 0.5f, 1.f }),
+ shape, alpha)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEAccumulateWeighted acc;
+ acc.configure(&src, alpha, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ 0.f, 0.5f, 1.f }),
+ shape, alpha)
+{
+ // Compute function
+ Tensor dst = compute_accumulate_weighted(shape, alpha);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_accumulate_weighted(shape, alpha);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ 0.f, 0.5f, 1.f }),
+ shape, alpha)
+{
+ // Compute function
+ Tensor dst = compute_accumulate_weighted(shape, alpha);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_accumulate_weighted(shape, alpha);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
new file mode 100644
index 0000000000..da304d8087
--- /dev/null
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Helpers.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Define tolerance of the activation layer
+ *
+ * @param[in] activation The activation function used.
+ * @param[in] fixed_point_position Number of bits for the fractional part.
+ *
+ * @return Tolerance depending on the activation function.
+ */
+float activation_layer_tolerance(ActivationLayerInfo::ActivationFunction activation, int fixed_point_position = 0)
+{
+ switch(activation)
+ {
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ return (fixed_point_position != 0) ? 5.f : 0.00001f;
+ default:
+ return 0.f;
+ }
+}
+
+/** Compute Neon activation layer function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt Data type of the tensors.
+ * @param[in] act_info Activation layer information.
+ * @param[in] fixed_point_position Number of bits for the fractional part of fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_activation_layer(const TensorShape &shape, DataType dt, ActivationLayerInfo act_info, int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NEActivationLayer act_layer;
+ act_layer.configure(&src, &dst, act_info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ if(dt == DataType::F32)
+ {
+ float min_bound = 0;
+ float max_bound = 0;
+ std::tie(min_bound, max_bound) = get_activation_layer_test_bounds<float>(act_info.activation());
+ std::uniform_real_distribution<> distribution(min_bound, max_bound);
+ library->fill(NEAccessor(src), distribution, 0);
+ }
+ else
+ {
+ int min_bound = 0;
+ int max_bound = 0;
+ std::tie(min_bound, max_bound) = get_activation_layer_test_bounds<int8_t>(act_info.activation(), fixed_point_position);
+ std::uniform_int_distribution<> distribution(min_bound, max_bound);
+ library->fill(NEAccessor(src), distribution, 0);
+ }
+
+ // Compute function
+ act_layer.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(ActivationLayer)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * CNNDataTypes(), shape, dt)
+{
+ // Set the fixed point position only if the data type allows it
+ int fixed_point_position = (arm_compute::is_data_type_fixed_point(dt)) ? 3 : 0;
+
+ // Create tensors
+ Tensor src = create_tensor(shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEActivationLayer act_layer;
+ act_layer.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS));
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFloatDataTypes() * ActivationFunctions(), shape, dt, act_function)
+{
+ // Create activation layer info
+ ActivationLayerInfo act_info(act_function, 1.f, 1.f);
+
+ // Compute function
+ Tensor dst = compute_activation_layer(shape, dt, act_info);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_activation_layer(shape, dt, act_info);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, activation_layer_tolerance(act_function));
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * CNNFloatDataTypes() * ActivationFunctions(), shape, dt, act_function)
+{
+ // Create activation layer info
+ ActivationLayerInfo act_info(act_function, 1.f, 1.f);
+
+ // Compute function
+ Tensor dst = compute_activation_layer(shape, dt, act_info);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_activation_layer(shape, dt, act_info);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, activation_layer_tolerance(act_function));
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+/** @note We test for fixed point precision [3,5] because [1,2] and [6,7] ranges
+ * cause overflow issues in most of the transcendental functions.
+ */
+BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * ActivationFunctions() * boost::unit_test::data::xrange(3, 6, 1),
+ shape, act_function, fixed_point_position)
+{
+ // Create activation layer info
+ ActivationLayerInfo act_info(act_function, 1.f, 1.f);
+
+ // Compute function
+ Tensor dst = compute_activation_layer(shape, DataType::QS8, act_info, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_activation_layer(shape, DataType::QS8, act_info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, activation_layer_tolerance(act_function, fixed_point_position));
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
new file mode 100644
index 0000000000..5654a426fd
--- /dev/null
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon arithmetic addition function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] policy Overflow policy of the operation.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_arithmetic_addition(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, ConvertPolicy policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt_in0);
+ Tensor src2 = create_tensor(shape, dt_in1);
+ Tensor dst = create_tensor(shape, dt_out);
+
+ // Create and configure function
+ NEArithmeticAddition add;
+ add.configure(&src1, &src2, &dst, policy);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+
+ // Compute function
+ add.run();
+
+ return dst;
+}
+
+void validate_configuration(const Tensor &src1, const Tensor &src2, Tensor &dst, TensorShape shape, ConvertPolicy policy)
+{
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEArithmeticAddition add;
+ add.configure(&src1, &src2, &dst, policy);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(ArithmeticAddition)
+
+BOOST_AUTO_TEST_SUITE(U8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ validate_configuration(src1, src2, dst, shape, policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_addition(shape, DataType::U8, DataType::U8, DataType::U8, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_addition(shape, DataType::U8, DataType::U8, DataType::U8, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(S16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, dt, policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt);
+ Tensor src2 = create_tensor(shape, DataType::S16);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ validate_configuration(src1, src2, dst, shape, policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, dt, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_addition(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_addition(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, dt, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_addition(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_addition(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(F32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::F32);
+ Tensor src2 = create_tensor(shape, DataType::F32);
+ Tensor dst = create_tensor(shape, DataType::F32);
+
+ validate_configuration(src1, src2, dst, shape, policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_addition(shape, DataType::F32, DataType::F32, DataType::F32, ConvertPolicy::WRAP);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_addition(shape, DataType::F32, DataType::F32, DataType::F32, ConvertPolicy::WRAP);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_addition(shape, DataType::F32, DataType::F32, DataType::F32, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_addition(shape, DataType::F32, DataType::F32, DataType::F32, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp
new file mode 100644
index 0000000000..9c0e9131e0
--- /dev/null
+++ b/tests/validation/NEON/ArithmeticSubtraction.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon arithmetic subtraction function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] policy Overflow policy of the operation.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_arithmetic_subtraction(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, ConvertPolicy policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt_in0);
+ Tensor src2 = create_tensor(shape, dt_in1);
+ Tensor dst = create_tensor(shape, dt_out);
+
+ // Create and configure function
+ NEArithmeticSubtraction sub;
+ sub.configure(&src1, &src2, &dst, policy);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+
+ // Compute function
+ sub.run();
+
+ return dst;
+}
+
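+/** Validate the configuration of an arithmetic subtraction function.
+ *
+ * @param[in]     src1   First input tensor.
+ * @param[in]     src2   Second input tensor.
+ * @param[in,out] dst    Output tensor.
+ * @param[in]     shape  Shape of the input and output tensors.
+ * @param[in]     policy Overflow policy of the operation.
+ */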
+void validate_configuration(const Tensor &src1, const Tensor &src2, Tensor &dst, TensorShape shape, ConvertPolicy policy)
+{
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEArithmeticSubtraction sub;
+ sub.configure(&src1, &src2, &dst, policy);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+    // Validate padding: 16 elements are processed per iteration, so each row is padded up to the next multiple of 16
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(ArithmeticSubtraction)
+
+BOOST_AUTO_TEST_SUITE(U8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ validate_configuration(src1, src2, dst, shape, policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_subtraction(shape, DataType::U8, DataType::U8, DataType::U8, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_subtraction(shape, DataType::U8, DataType::U8, DataType::U8, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
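+// Boost.Test's dataset operator* forms the cartesian product, so each case in
+// the S16 suite below runs for every combination of shape, first-input data
+// type (U8 or S16) and convert policy.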
+BOOST_AUTO_TEST_SUITE(S16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, dt, policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt);
+ Tensor src2 = create_tensor(shape, DataType::S16);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ validate_configuration(src1, src2, dst, shape, policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, dt, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_subtraction(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_subtraction(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, dt, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_subtraction(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_subtraction(shape, dt, DataType::S16, DataType::S16, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(F32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::F32);
+ Tensor src2 = create_tensor(shape, DataType::F32);
+ Tensor dst = create_tensor(shape, DataType::F32);
+
+ validate_configuration(src1, src2, dst, shape, policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_subtraction(shape, DataType::F32, DataType::F32, DataType::F32, ConvertPolicy::WRAP);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_subtraction(shape, DataType::F32, DataType::F32, DataType::F32, ConvertPolicy::WRAP);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP }),
+ shape, policy)
+{
+ // Compute function
+ Tensor dst = compute_arithmetic_subtraction(shape, DataType::F32, DataType::F32, DataType::F32, policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_arithmetic_subtraction(shape, DataType::F32, DataType::F32, DataType::F32, policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/BatchNormalizationLayer.cpp b/tests/validation/NEON/BatchNormalizationLayer.cpp
new file mode 100644
index 0000000000..7656b2f392
--- /dev/null
+++ b/tests/validation/NEON/BatchNormalizationLayer.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TypePrinter.h"
+#include "dataset/BatchNormalizationLayerDataset.h"
+#include "tests/validation/Helpers.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+
+#include <random>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance_f = 1e-05f; /**< Tolerance value for comparing reference's output against floating point implementation's output */
+const float tolerance_q = 3.f;    /**< Tolerance value for comparing reference's output against quantized implementation's output */
+
+/** Compute Neon batch normalization function.
+ *
+ * Computes dst = gamma * (src - mean) / sqrt(var + epsilon) + beta per feature map.
+ *
+ * @param[in] shape0               Shape of the input and output tensors.
+ * @param[in] shape1               Shape of the mean, variance, beta and gamma tensors.
+ * @param[in] dt                   Data type of input and output tensors.
+ * @param[in] epsilon              Small value added to the variance to avoid divisions by zero.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_batch_normalization_layer(const TensorShape &shape0, const TensorShape &shape1, DataType dt, float epsilon, int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape0, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape0, dt, 1, fixed_point_position);
+ Tensor mean = create_tensor(shape1, dt, 1, fixed_point_position);
+ Tensor var = create_tensor(shape1, dt, 1, fixed_point_position);
+ Tensor beta = create_tensor(shape1, dt, 1, fixed_point_position);
+ Tensor gamma = create_tensor(shape1, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NEBatchNormalizationLayer norm;
+ norm.configure(&src, &dst, &mean, &var, &beta, &gamma, epsilon);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+ mean.allocator()->allocate();
+ var.allocator()->allocate();
+ beta.allocator()->allocate();
+ gamma.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+ BOOST_TEST(!mean.info()->is_resizable());
+ BOOST_TEST(!var.info()->is_resizable());
+ BOOST_TEST(!beta.info()->is_resizable());
+ BOOST_TEST(!gamma.info()->is_resizable());
+
+ // Fill tensors
+ if(dt == DataType::F32)
+ {
+ float min_bound = 0.f;
+ float max_bound = 0.f;
+ std::tie(min_bound, max_bound) = get_batchnormalization_layer_test_bounds<float>();
+ std::uniform_real_distribution<> distribution(min_bound, max_bound);
+ std::uniform_real_distribution<> distribution_var(0, max_bound);
+ library->fill(NEAccessor(src), distribution, 0);
+ library->fill(NEAccessor(mean), distribution, 1);
+ library->fill(NEAccessor(var), distribution_var, 0);
+ library->fill(NEAccessor(beta), distribution, 3);
+ library->fill(NEAccessor(gamma), distribution, 4);
+ }
+ else
+ {
+ int min_bound = 0;
+ int max_bound = 0;
+ std::tie(min_bound, max_bound) = get_batchnormalization_layer_test_bounds<int8_t>(fixed_point_position);
+ std::uniform_int_distribution<> distribution(min_bound, max_bound);
+ std::uniform_int_distribution<> distribution_var(0, max_bound);
+ library->fill(NEAccessor(src), distribution, 0);
+ library->fill(NEAccessor(mean), distribution, 1);
+ library->fill(NEAccessor(var), distribution_var, 0);
+ library->fill(NEAccessor(beta), distribution, 3);
+ library->fill(NEAccessor(gamma), distribution, 4);
+ }
+
+ // Compute function
+ norm.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(BatchNormalizationLayer)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, RandomBatchNormalizationLayerDataset() * (boost::unit_test::data::make(DataType::F32) + boost::unit_test::data::make(DataType::QS8)), obj, dt)
+{
+ // Set fixed point position data type allowed
+ int fixed_point_position = (arm_compute::is_data_type_fixed_point(dt)) ? 3 : 0;
+
+ // Create tensors
+ Tensor src = create_tensor(obj.shape0, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(obj.shape0, dt, 1, fixed_point_position);
+ Tensor mean = create_tensor(obj.shape1, dt, 1, fixed_point_position);
+ Tensor var = create_tensor(obj.shape1, dt, 1, fixed_point_position);
+ Tensor beta = create_tensor(obj.shape1, dt, 1, fixed_point_position);
+ Tensor gamma = create_tensor(obj.shape1, dt, 1, fixed_point_position);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+ BOOST_TEST(mean.info()->is_resizable());
+ BOOST_TEST(var.info()->is_resizable());
+ BOOST_TEST(beta.info()->is_resizable());
+ BOOST_TEST(gamma.info()->is_resizable());
+
+ // Create and configure function
+ NEBatchNormalizationLayer norm;
+ norm.configure(&src, &dst, &mean, &var, &beta, &gamma, obj.epsilon);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(obj.shape0);
+ const ValidRegion valid_region_vec = shape_to_valid_region(obj.shape1);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+ validate(mean.info()->valid_region(), valid_region_vec);
+ validate(var.info()->valid_region(), valid_region_vec);
+ validate(beta.info()->valid_region(), valid_region_vec);
+ validate(gamma.info()->valid_region(), valid_region_vec);
+}
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(Random,
+ RandomBatchNormalizationLayerDataset() * boost::unit_test::data::make(DataType::F32),
+ obj, dt)
+{
+ // Compute function
+    Tensor dst = compute_batch_normalization_layer(obj.shape0, obj.shape1, dt, obj.epsilon);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_batch_normalization_layer(obj.shape0, obj.shape1, dt, obj.epsilon);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f, 0);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
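+// Note: xrange(1, 6) sweeps the fixed point position over 1..5 (the upper bound is exclusive).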
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(Random,
+ RandomBatchNormalizationLayerDataset() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 6),
+ obj, dt, fixed_point_position)
+{
+ // Compute function
+    Tensor dst = compute_batch_normalization_layer(obj.shape0, obj.shape1, dt, obj.epsilon, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_batch_normalization_layer(obj.shape0, obj.shape1, dt, obj.epsilon, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_q, 0);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/BitwiseAnd.cpp b/tests/validation/NEON/BitwiseAnd.cpp
new file mode 100644
index 0000000000..8c0eda992f
--- /dev/null
+++ b/tests/validation/NEON/BitwiseAnd.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon bitwise AND function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_bitwise_and(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ // Create and configure function
+ NEBitwiseAnd band;
+ band.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+
+ // Compute function
+ band.run();
+
+ return dst;
+}
+
+/** Compute Neon bitwise AND function that splits the input and output into two sub-tensors each.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_bitwise_and_subtensor(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+    // Create sub-tensors by splitting each tensor in half along the Z dimension
+ int coord_z = shape.z() / 2;
+ TensorShape sub_shape = shape;
+ sub_shape.set(2, coord_z);
+
+ SubTensor src1_sub1(&src1, sub_shape, Coordinates());
+ SubTensor src1_sub2(&src1, sub_shape, Coordinates(0, 0, coord_z));
+ SubTensor src2_sub1(&src2, sub_shape, Coordinates());
+ SubTensor src2_sub2(&src2, sub_shape, Coordinates(0, 0, coord_z));
+ SubTensor dst_sub1(&dst, sub_shape, Coordinates());
+ SubTensor dst_sub2(&dst, sub_shape, Coordinates(0, 0, coord_z));
+
+ // Create and configure function
+ NEBitwiseAnd band1, band2;
+ band1.configure(&src1_sub1, &src2_sub1, &dst_sub1);
+ band2.configure(&src1_sub2, &src2_sub2, &dst_sub2);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ std::uniform_int_distribution<> distribution(0, 255);
+ library->fill(NEAccessor(src1), distribution, 0);
+ library->fill(NEAccessor(src2), distribution, 1);
+
+ // Compute function
+ band1.run();
+ band2.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(BitwiseAnd)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEBitwiseAnd band;
+ band.configure(&src1, &src2, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_and(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_and(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_AUTO_TEST_CASE(RunSubTensor)
+{
+ // Create shape
+ TensorShape shape(27U, 35U, 8U, 2U);
+
+ // Compute function
+ Tensor dst = compute_bitwise_and_subtensor(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_and(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_and(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_and(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/BitwiseNot.cpp b/tests/validation/NEON/BitwiseNot.cpp
new file mode 100644
index 0000000000..cb0a1fd0b5
--- /dev/null
+++ b/tests/validation/NEON/BitwiseNot.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon bitwise NOT function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_bitwise_not(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+    // Create and configure function
+ NEBitwiseNot bnot;
+ bnot.configure(&src, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+
+ // Compute function
+ bnot.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(BitwiseNot)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+    // Create and configure function
+ NEBitwiseNot bnot;
+ bnot.configure(&src, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_not(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_not(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_not(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_not(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/BitwiseOr.cpp b/tests/validation/NEON/BitwiseOr.cpp
new file mode 100644
index 0000000000..cb853d3fd4
--- /dev/null
+++ b/tests/validation/NEON/BitwiseOr.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon bitwise OR function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_bitwise_or(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ // Create and configure function
+ NEBitwiseOr bor;
+ bor.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+
+ // Compute function
+ bor.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(BitwiseOr)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEBitwiseOr bor;
+ bor.configure(&src1, &src2, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_or(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_or(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_or(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_or(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/BitwiseXor.cpp b/tests/validation/NEON/BitwiseXor.cpp
new file mode 100644
index 0000000000..1715b04609
--- /dev/null
+++ b/tests/validation/NEON/BitwiseXor.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon bitwise XOR function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_bitwise_xor(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+    // Create and configure function
+ NEBitwiseXor bxor;
+ bxor.configure(&src1, &src2, &dst);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+
+ // Compute function
+ bxor.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(BitwiseXor)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+    // Create and configure function
+ NEBitwiseXor bxor;
+ bxor.configure(&src1, &src2, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_xor(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_xor(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_bitwise_xor(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_bitwise_xor(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Box3x3.cpp b/tests/validation/NEON/Box3x3.cpp
new file mode 100644
index 0000000000..5da015c73a
--- /dev/null
+++ b/tests/validation/NEON/Box3x3.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon 3-by-3 box filter.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_box3x3(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ // Create and configure function
+    NEBox3x3 box3x3;
+    box3x3.configure(&src, &dst, BorderMode::UNDEFINED);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+
+ // Compute function
+    box3x3.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(Box3x3)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+    NEBox3x3 box3x3;
+    box3x3.configure(&src, &dst, BorderMode::UNDEFINED);
+
+    // Validate valid region: BorderMode::UNDEFINED leaves a 1-pixel output border unwritten, shrinking the destination valid region by BorderSize(1)
+ const ValidRegion src_valid_region = shape_to_valid_region(shape);
+ const ValidRegion dst_valid_region = shape_to_valid_region_undefined_border(shape, BorderSize(1));
+ validate(src.info()->valid_region(), src_valid_region);
+ validate(dst.info()->valid_region(), dst_valid_region);
+
+ // Validate padding
+ const PaddingSize read_padding(0, required_padding_undefined_border_read(shape.x(), 16, 8), 0, 0);
+ const PaddingSize write_padding(0, required_padding_undefined_border_write(shape.x(), 8, 1), 0, 0);
+ validate(src.info()->padding(), read_padding);
+ validate(dst.info()->padding(), write_padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_box3x3(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_box3x3(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, shape_to_valid_region_undefined_border(shape, BorderSize(1)));
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_box3x3(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_box3x3(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, shape_to_valid_region_undefined_border(shape, BorderSize(1)));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/CMakeLists.txt b/tests/validation/NEON/CMakeLists.txt
new file mode 100644
index 0000000000..52678f345b
--- /dev/null
+++ b/tests/validation/NEON/CMakeLists.txt
@@ -0,0 +1,59 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+set(arm_compute_test_validation_NEON_SOURCE_FILES
+ ${CMAKE_SOURCE_DIR}/NEON/Helper.h
+ ${CMAKE_SOURCE_DIR}/NEON/NEAccessor.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/AbsoluteDifference.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Accumulate.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/AccumulateSquared.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/AccumulateWeighted.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/ArithmeticAddition.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/ArithmeticSubtraction.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseAnd.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseNot.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseOr.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/BitwiseXor.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Box3x3.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Fixedpoint/Exp_QS8.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Fixedpoint/Invsqrt_QS8.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Fixedpoint/Log_QS8.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Fixedpoint/Reciprocal_QS8.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/IntegralImage.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/NormalizationLayer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/PixelWiseMultiplication.cpp
+)
+
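+# Note: these objects are consumed by the enclosing CMakeLists.txt, roughly
+# (assumed parent usage, shown for illustration only):
+#   add_executable(arm_compute_validation ... ${arm_compute_test_validation_TARGET_OBJECTS})
+# which is why the variable is exported with PARENT_SCOPE below.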
+add_library(arm_compute_test_validation_NEON OBJECT
+ ${arm_compute_test_validation_NEON_SOURCE_FILES}
+)
+
+set(arm_compute_test_validation_TARGET_OBJECTS
+ ${arm_compute_test_validation_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_validation_NEON>
+ PARENT_SCOPE
+)
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
new file mode 100644
index 0000000000..a1dbe38bbf
--- /dev/null
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TypePrinter.h"
+#include "dataset/ConvolutionLayerDataset.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+
+#include <random>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance_f32 = 1e-03f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+const float tolerance_qs8 = 3.0f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::QS8 */
+
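+/** Compute Neon convolution layer function.
+ *
+ * @param[in] input_shape          Shape of the input tensor.
+ * @param[in] weights_shape        Shape of the weights tensor.
+ * @param[in] bias_shape           Shape of the bias tensor.
+ * @param[in] output_shape         Shape of the output tensor.
+ * @param[in] dt                   Data type of input, weights, bias and output tensors.
+ * @param[in] conv_info            Padding and stride information.
+ * @param[in] fixed_point_position Number of bits for the fractional part of fixed point numbers (0 for floating point).
+ *
+ * @return Computed output tensor.
+ */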
+Tensor compute_convolution_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt,
+ const PadStrideInfo &conv_info, int fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(input_shape, dt, 1, fixed_point_position);
+ Tensor weights = create_tensor(weights_shape, dt, 1, fixed_point_position);
+ Tensor bias = create_tensor(bias_shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(output_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NEConvolutionLayer conv;
+ conv.configure(&src, &weights, &bias, &dst, conv_info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!weights.info()->is_resizable());
+ BOOST_TEST(!bias.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ if(dt == DataType::F32)
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(NEAccessor(src), distribution, 0);
+ library->fill(NEAccessor(weights), distribution, 1);
+ library->fill(NEAccessor(bias), distribution, 2);
+ }
+ else
+ {
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+ library->fill_tensor_uniform(NEAccessor(weights), 1);
+ library->fill_tensor_uniform(NEAccessor(bias), 2);
+ }
+
+ // Compute NEConvolutionLayer function
+ conv.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(ConvolutionLayer)
+BOOST_AUTO_TEST_SUITE(GEMM)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration,
+ AlexNetConvolutionLayerDataset() * boost::unit_test::data::make({ DataType::F32, DataType::QS8 }),
+ conv_set, dt)
+{
+    // Set the fixed point position (only non-zero for fixed point data types)
+ int fixed_point_position = (dt == DataType::F32) ? 0 : 3;
+
+ // Create tensors
+ Tensor src = create_tensor(conv_set.src_shape, dt, 1, fixed_point_position);
+ Tensor weights = create_tensor(conv_set.weights_shape, dt, 1, fixed_point_position);
+ Tensor bias = create_tensor(conv_set.bias_shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(conv_set.dst_shape, dt, 1, fixed_point_position);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(weights.info()->is_resizable());
+ BOOST_TEST(bias.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEConvolutionLayer conv;
+ conv.configure(&src, &weights, &bias, &dst, conv_set.info);
+
+ // Validate valid region
+ const ValidRegion src_valid_region = shape_to_valid_region(conv_set.src_shape);
+ const ValidRegion weights_valid_region = shape_to_valid_region(conv_set.weights_shape);
+ const ValidRegion bias_valid_region = shape_to_valid_region(conv_set.bias_shape);
+ const ValidRegion dst_valid_region = shape_to_valid_region(conv_set.dst_shape);
+
+ validate(src.info()->valid_region(), src_valid_region);
+ validate(weights.info()->valid_region(), weights_valid_region);
+ validate(bias.info()->valid_region(), bias_valid_region);
+ validate(dst.info()->valid_region(), dst_valid_region);
+}
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(SmallConvolutionLayer,
+ SmallConvolutionLayerDataset() * boost::unit_test::data::make(DataType::F32),
+ conv_set, dt)
+{
+ // Compute function
+ Tensor dst = compute_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f32);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(LargeConvolutionLayer,
+ AlexNetConvolutionLayerDataset() * boost::unit_test::data::make(DataType::F32),
+ conv_set, dt)
+{
+ // Compute function
+ Tensor dst = compute_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f32);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(SmallConvolutionLayer,
+ SmallConvolutionLayerDataset() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(4, 7),
+ conv_set, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_qs8);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(LargeConvolutionLayer,
+ AlexNetConvolutionLayerDataset() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(4, 7),
+ conv_set, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_convolution_layer(conv_set.src_shape, conv_set.weights_shape, conv_set.bias_shape, conv_set.dst_shape, dt, conv_set.info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_qs8);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
\ No newline at end of file
diff --git a/tests/validation/NEON/ConvolutionLayerDirect.cpp b/tests/validation/NEON/ConvolutionLayerDirect.cpp
new file mode 100644
index 0000000000..4e36e331bd
--- /dev/null
+++ b/tests/validation/NEON/ConvolutionLayerDirect.cpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance_fp = 1e-3f; /**< Tolerance for floating point tests */
+const float tolerance_qs8 = 1; /**< Tolerance for fixed point tests */
+
+/** Compute NEON direct convolution layer function.
+ *
+ * @param[in] src_shape Shape of the input tensor.
+ * @param[in] weights_shape Shape of the weights.
+ * @param[in] bias_shape Shape of the bias tensor.
+ * @param[in] dst_shape Shape of the output tensor.
+ * @param[in] dt                   Data type of the input, weights, bias and output tensors.
+ * @param[in] conv_info            Padding and stride information.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_convolution_layer(const TensorShape &src_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &dst_shape,
+ DataType dt, PadStrideInfo conv_info, int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src = create_tensor(src_shape, dt, 1, fixed_point_position);
+ Tensor weights = create_tensor(weights_shape, dt, 1, fixed_point_position);
+ Tensor bias = create_tensor(bias_shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(dst_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NEDirectConvolutionLayer conv_layer;
+ conv_layer.configure(&src, &weights, &bias, &dst, conv_info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!weights.info()->is_resizable());
+ BOOST_TEST(!bias.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ if(dt == DataType::F32)
+ {
+ std::uniform_real_distribution<> distribution(-1.f, 1.f);
+ library->fill(NEAccessor(src), distribution, 0);
+ library->fill(NEAccessor(weights), distribution, 1);
+ library->fill(NEAccessor(bias), distribution, 2);
+ }
+ else
+ {
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+ library->fill_tensor_uniform(NEAccessor(weights), 1);
+ library->fill_tensor_uniform(NEAccessor(bias), 2);
+ }
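+
+ // The distinct seeds (0, 1, 2) are assumed to select independent
+ // pseudo-random streams from the shared tensor library, so src, weights
+ // and bias receive different data.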
+
+ // Compute function
+ conv_layer.run();
+
+ return dst;
+}
+
+TensorShape get_output_shape(const TensorShape &in_shape, const TensorShape &kernel_shape, const PadStrideInfo &conv_info)
+{
+ TensorShape out_shape(in_shape);
+ const std::pair<unsigned int, unsigned int> scaled_dims = arm_compute::scaled_dimensions(in_shape.x(),
+ in_shape.y(),
+ kernel_shape.x(),
+ conv_info.stride().first, conv_info.stride().second,
+ conv_info.pad().first, conv_info.pad().second,
+ conv_info.round());
+ out_shape.set(0, scaled_dims.first);
+ out_shape.set(1, scaled_dims.second);
+ out_shape.set(2, kernel_shape[3]);
+ return out_shape;
+}
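+
+// Assuming scaled_dimensions applies the padding symmetrically, the helper
+// above implements the usual convolution arithmetic
+//     out = (in + 2 * pad - kernel) / stride + 1
+// with FLOOR rounding as requested by the test cases below. E.g. a 28x28
+// input, a 3x3 kernel, stride 1 and pad 0 give a 26x26 output, with one
+// plane per kernel (kernel_shape[3]).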
+
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(ConvolutionLayer)
+BOOST_AUTO_TEST_SUITE(Direct)
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(W1x1,
+ DirectConvolutionShapes() * CNNFloatDataTypes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }),
+ input_shape, dt, sx, sy, num_kernels)
+{
+ const unsigned int kernel_size = 1;
+ const PadStrideInfo conv_info(sx, sy, 0, 0, DimensionRoundingType::FLOOR);
+ const TensorShape w_shape(kernel_size, kernel_size, input_shape.z(), static_cast<unsigned int>(num_kernels));
+ const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
+ const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
+
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info);
+
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(W3x3,
+                     DirectConvolutionShapes() * CNNFloatDataTypes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1)
+                     * boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }),
+                     input_shape, dt, sx, sy, px, py, num_kernels)
+{
+ const unsigned int kernel_size = 3;
+ const PadStrideInfo conv_info(sx, sy, px, py, DimensionRoundingType::FLOOR);
+ const TensorShape w_shape(kernel_size, kernel_size, input_shape.z(), static_cast<unsigned int>(num_kernels));
+ const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
+ const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
+
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info);
+
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref, tolerance_fp);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(W1x1,
+ DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }),
+ input_shape, sx, sy, num_kernels, fixed_point_position)
+{
+ const unsigned int kernel_size = 1;
+ const PadStrideInfo conv_info(sx, sy, 0, 0, DimensionRoundingType::FLOOR);
+ const TensorShape w_shape(kernel_size, kernel_size, input_shape.z(), static_cast<unsigned int>(num_kernels));
+ const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
+ const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
+
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(0, 2, 1)
+ * boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }) * boost::unit_test::data::make({ 4, 5 }),
+ input_shape, sx, sy, px, py, num_kernels, fixed_point_position)
+{
+ const unsigned int kernel_size = 3;
+ const PadStrideInfo conv_info(sx, sy, px, py, DimensionRoundingType::FLOOR);
+ const TensorShape w_shape(kernel_size, kernel_size, input_shape.z(), static_cast<unsigned int>(num_kernels));
+ const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
+ const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
+
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, DataType::QS8, conv_info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref, tolerance_qs8);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
\ No newline at end of file
diff --git a/tests/validation/NEON/DepthConvert.cpp b/tests/validation/NEON/DepthConvert.cpp
new file mode 100644
index 0000000000..ec0bb7ccc5
--- /dev/null
+++ b/tests/validation/NEON/DepthConvert.cpp
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon depth convert function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in Data type of input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ * @param[in] fixed_point_position Fixed point position.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_depth_convert(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift, uint32_t fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, dt_in, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt_out, 1, fixed_point_position);
+
+ // Create and configure function
+ NEDepthConvert depth_convert;
+ depth_convert.configure(&src, &dst, policy, shift);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+
+ // Compute function
+ depth_convert.run();
+
+ return dst;
+}
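+
+// A rough sketch of the assumed shift semantics: for an up conversion such
+// as U8 -> U16 the kernel is expected to compute approximately
+//     dst = static_cast<uint16_t>(src) << shift;
+// so shift = 3 turns a source value of 20 into 160. The conversion policy
+// (SATURATE or WRAP) only matters where the result could exceed the
+// destination range.
+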
+/** Configure a depth convert function and validate its valid region and padding.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in Data type of input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] policy Conversion policy.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ * @param[in] fixed_point_position Fixed point position.
+ */
+void compute_configure_validate(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift, uint32_t fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, dt_in, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt_out, 1, fixed_point_position);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEDepthConvert depth_convert;
+ depth_convert.configure(&src, &dst, policy, shift);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
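+
+// required_padding is assumed to round the row length up to the next
+// multiple of the 16 elements processed per iteration; e.g. a width of 20
+// would need 12 elements of right padding so that a full 16-wide vector
+// access stays in bounds.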
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(DepthConvert)
+
+BOOST_AUTO_TEST_SUITE(QS8_to_F32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 7, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::QS8, DataType::F32, policy, 0, fixed_point_position);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 7, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::QS8, DataType::F32, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS8, DataType::F32, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 7, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::QS8, DataType::F32, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS8, DataType::F32, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(F32_to_QS8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 7, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::F32, DataType::QS8, policy, 0, fixed_point_position);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 7, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS8, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS8, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 7, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS8, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS8, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U8_to_U16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U8, DataType::U16, policy, shift, 0);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U8, DataType::U16, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::U16, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U8, DataType::U16, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::U16, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U8_to_S16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U8, DataType::S16, policy, shift, 0);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U8, DataType::S16, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S16, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U8, DataType::S16, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S16, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U8_to_S32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U8, DataType::S32, policy, shift, 0);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U8, DataType::S32, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U8, DataType::S32, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U8, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U16_to_U8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U16, DataType::U8, policy, shift, 0);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U16, DataType::U8, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U16, DataType::U8, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(U16_to_U32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::U16, DataType::U32, policy, shift, 0);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U16, DataType::U32, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U32, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::U16, DataType::U32, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::U16, DataType::U32, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(S16_to_U8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::S16, DataType::U8, policy, shift, 0);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::S16, DataType::U8, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::S16, DataType::U8, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::U8, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(S16_to_S32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::S16, DataType::S32, policy, shift, 0);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::S16, DataType::S32, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE, ConvertPolicy::WRAP })
+ * boost::unit_test::data::xrange(0, 7, 1),
+ shape, policy, shift)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::S16, DataType::S32, policy, shift, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::S16, DataType::S32, policy, shift, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/FillBorder.cpp b/tests/validation/NEON/FillBorder.cpp
new file mode 100644
index 0000000000..9fbeb998f5
--- /dev/null
+++ b/tests/validation/NEON/FillBorder.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(FillBorder, BorderModes() * boost::unit_test::data::make({ PaddingSize{ 0 }, PaddingSize{ 1, 0, 1, 2 }, PaddingSize{ 10 } }), border_mode, padding)
+{
+ constexpr uint8_t border_value = 42U;
+ constexpr uint8_t tensor_value = 89U;
+ BorderSize border_size{ 5 };
+
+ // Create tensors
+ Tensor src = create_tensor(TensorShape{ 10U, 10U, 2U }, DataType::U8);
+
+ src.info()->extend_padding(padding);
+
+ // Allocate tensor
+ src.allocator()->allocate();
+
+ // Check padding is as required
+ validate(src.info()->padding(), padding);
+
+ // Fill tensor with constant value
+ std::uniform_int_distribution<uint8_t> distribution{ tensor_value, tensor_value };
+ library->fill(NEAccessor(src), distribution, 0);
+
+ // Create and configure kernel
+ NEFillBorderKernel fill_border;
+ fill_border.configure(&src, border_size, border_mode, border_value);
+
+ // Run kernel
+ fill_border.run(fill_border.window());
+
+ // Validate border
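+ // limit() is assumed to clamp each side of the requested 5-element border
+ // to the padding actually allocated, so only bytes that exist are checked.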
+ border_size.limit(padding);
+ validate(NEAccessor(src), border_size, border_mode, &border_value);
+
+ // Validate tensor
+ validate(NEAccessor(src), &tensor_value);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Exp_QS8.cpp b/tests/validation/NEON/Fixedpoint/Exp_QS8.cpp
new file mode 100644
index 0000000000..086314fdd3
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Exp_QS8.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 0.0f; /**< Tolerance value for comparing reference's output against implementation's output */
+
+/** Compute Neon exponential function for signed 8bit fixed point.
+ *
+ * @param[in] shape                Shape of the input and output tensors.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_exp_qs8(const TensorShape &shape, int fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ Window window = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, src.info()->valid_region());
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors. Keep the range between (1, (1 << (fixed_point_position - 1))) so the result won't
+ // overflow. E.g. e^7 ~= 1096.6, which cannot be represented in QS8.
+ std::uniform_int_distribution<> distribution(1, (1 << (fixed_point_position - 1)));
+ library->fill(NEAccessor(src), distribution, 0);
+
+ Iterator input(&src, window);
+ Iterator output(&dst, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ qint8x16_t in = vld1q_s8(reinterpret_cast<const qint8_t *>(input.ptr()));
+ // Use saturated exp
+ vst1q_s8(reinterpret_cast<qint8_t *>(output.ptr()), vqexpq_qs8(in, fixed_point_position));
+ },
+ input, output);
+
+ return dst;
+}
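+
+// Worked range check, assuming QS8 with fixed_point_position f stores
+// value = raw / 2^f with raw in [-128, 127]: the fill above caps the input
+// at raw = 2^(f - 1), i.e. a real value of 0.5, and e^0.5 ~= 1.65 fits for
+// every tested position (the QS8 maximum is 127 / 2^f, e.g. ~1.98 at f = 6).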
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS8)
+BOOST_AUTO_TEST_SUITE(Exp)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, Small1DShape() * boost::unit_test::data::xrange(1, 7), shape, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_exp_qs8(shape, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS8, DataType::QS8, FixedPointOp::EXP, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp b/tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp
new file mode 100644
index 0000000000..3308f7d855
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 3; /**< Tolerance value for comparing reference's output against implementation's output */
+
+/** Compute Neon inverse square root function for signed 8bit fixed point.
+ *
+ * @param[in] shape                Shape of the input and output tensors.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_invsqrt_qs8(const TensorShape &shape, int fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ Window window = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, src.info()->valid_region());
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors. Keep the range between (32, 127) so the result won't
+ // overflow. E.g. for Q2.5 invsqrt(0.001) = 31.6, which cannot be represented.
+ std::uniform_int_distribution<> distribution(32, 127);
+ library->fill(NEAccessor(src), distribution, 0);
+
+ Iterator input(&src, window);
+ Iterator output(&dst, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ qint8x16_t in = vld1q_s8(reinterpret_cast<const qint8_t *>(input.ptr()));
+ vst1q_s8(reinterpret_cast<qint8_t *>(output.ptr()), vinvsqrtq_qs8(in, fixed_point_position));
+ },
+ input, output);
+
+ return dst;
+}
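+
+// Worked range check: in Q2.5 (fixed_point_position = 5) the fill above
+// covers raw values 32..127, i.e. real values 1.0..~3.97, whose inverse
+// square roots lie in ~0.5..1.0 and are comfortably representable in QS8.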
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS8)
+BOOST_AUTO_TEST_SUITE(Invsqrt)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::xrange(1, 6), shape, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_invsqrt_qs8(shape, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS8, DataType::QS8, FixedPointOp::INV_SQRT, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Log_QS8.cpp b/tests/validation/NEON/Fixedpoint/Log_QS8.cpp
new file mode 100644
index 0000000000..7b734c12b1
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Log_QS8.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 5; /**< Tolerance value for comparing reference's output against implementation's output */
+
+/** Compute Neon logarithm function for signed 8bit fixed point.
+ *
+ * @param[in] shape                Shape of the input and output tensors.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_log_qs8(const TensorShape &shape, int fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ Window window = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, src.info()->valid_region());
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors. Keep the range between (1 << (fixed_point_position - 1), 63) so the result won't
+ // overflow. E.g. for Q2.5 ln(0.001) ~= -6.9, which cannot be represented.
+ std::uniform_int_distribution<> distribution((1 << (fixed_point_position - 1)), 63);
+ library->fill(NEAccessor(src), distribution, 0);
+
+ Iterator input(&src, window);
+ Iterator output(&dst, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ qint8x16_t in = vld1q_s8(reinterpret_cast<const qint8_t *>(input.ptr()));
+ vst1q_s8(reinterpret_cast<qint8_t *>(output.ptr()), vlogq_qs8(in, fixed_point_position));
+ },
+ input, output);
+
+ return dst;
+}
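+
+// Worked range check: in Q2.5 the fill above covers raw values 16..63, i.e.
+// real values 0.5..~1.97, whose logarithms lie in ~-0.69..0.68 and fit
+// easily in QS8.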
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS8)
+BOOST_AUTO_TEST_SUITE(Log)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, Small1DShape() * boost::unit_test::data::xrange(3, 6), shape, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_log_qs8(shape, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS8, DataType::QS8, FixedPointOp::LOG, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp b/tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp
new file mode 100644
index 0000000000..4c1c782a18
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 3; /**< Tolerance value for comparing reference's output against implementation's output */
+
+/** Compute Neon reciprocal function for signed 8bit fixed point.
+ *
+ * @param[in] shape                Shape of the input and output tensors.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_reciprocal_qs8(const TensorShape &shape, int fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, DataType::QS8, 1, fixed_point_position);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ Window window = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(window, input_access, output_access);
+ output_access.set_valid_region(window, src.info()->valid_region());
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors. Keep the range between (15, 100) so the result won't
+ // overflow. E.g. for Q2.5 reciprocal(0.001) = 1000, which cannot be represented.
+ std::uniform_int_distribution<> distribution(15, 100);
+ library->fill(NEAccessor(src), distribution, 0);
+
+ Iterator input(&src, window);
+ Iterator output(&dst, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ qint8x16_t in = vld1q_s8(reinterpret_cast<const qint8_t *>(input.ptr()));
+ vst1q_s8(reinterpret_cast<qint8_t *>(output.ptr()), vrecipq_qs8(in, fixed_point_position));
+ },
+ input, output);
+
+ return dst;
+}
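+
+// Worked range check: in Q2.5 the fill above covers raw values 15..100, i.e.
+// real values ~0.47..3.125, whose reciprocals lie in ~0.32..2.13 and stay
+// below the Q2.5 maximum of ~3.97.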
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS8)
+BOOST_AUTO_TEST_SUITE(Reciprocal)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, Small1DShape() * boost::unit_test::data::xrange(1, 6), shape, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_reciprocal_qs8(shape, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS8, DataType::QS8, FixedPointOp::RECIPROCAL, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000..bda235bd55
--- /dev/null
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TypePrinter.h"
+#include "dataset/FullyConnectedLayerDataset.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+
+#include <random>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance_f32 = 1e-03f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+const float tolerance_qs8 = 1.0f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::QS8 */
+
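+/** Compute NEON fully connected layer function.
+ *
+ * @param[in] input_shape          Shape of the input tensor.
+ * @param[in] weights_shape        Shape of the weights tensor, as given by the dataset.
+ * @param[in] bias_shape           Shape of the bias tensor.
+ * @param[in] output_shape         Shape of the output tensor.
+ * @param[in] dt                   Data type of the input, weights, bias and output tensors.
+ * @param[in] transpose_weights    If true, swap the first and second dimension of the weights' shape before creating the tensor.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */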
+Tensor compute_fully_connected_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt,
+ bool transpose_weights, int fixed_point_position)
+{
+ // Create tensors
+ Tensor src = create_tensor(input_shape, dt, 1, fixed_point_position);
+ Tensor bias = create_tensor(bias_shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(output_shape, dt, 1, fixed_point_position);
+
+ // Swap the first and second dimension of weights' shape if transpose_weights is true
+ TensorShape ws = weights_shape;
+ if(transpose_weights)
+ {
+ const size_t dimx = ws.x();
+ ws.set(0, ws.y());
+ ws.set(1, dimx);
+ }
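+
+ // E.g. a dataset weights shape of (128, 10) with transpose_weights = true
+ // becomes ws = (10, 128): the tensor is created already transposed, so the
+ // function below is configured with transpose_weights = false.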
+
+ Tensor weights = create_tensor(ws, dt, 1, fixed_point_position);
+
+ // Create and configure function.
+ // Note: We pass the weights already transposed
+ NEFullyConnectedLayer fc;
+ fc.configure(&src, &weights, &bias, &dst, false);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ weights.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!weights.info()->is_resizable());
+ BOOST_TEST(!bias.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ if(dt == DataType::F32)
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(NEAccessor(src), distribution, 0);
+ library->fill(NEAccessor(weights), distribution, 1);
+ library->fill(NEAccessor(bias), distribution, 2);
+ }
+ else
+ {
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+ library->fill_tensor_uniform(NEAccessor(weights), 1);
+ library->fill_tensor_uniform(NEAccessor(bias), 2);
+ }
+
+ // Compute NEFullyConnectedLayer function
+ fc.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FullyConnectedLayer)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration,
+ SmallFullyConnectedLayerDataset() * boost::unit_test::data::make({ DataType::F32, DataType::QS8 }),
+ fc_set, dt)
+{
+ // Set the fixed point position (only used by fixed point data types)
+ int fixed_point_position = (dt == DataType::F32) ? 0 : 3;
+
+ // Create tensors
+ Tensor src = create_tensor(fc_set.src_shape, dt, 1, fixed_point_position);
+ Tensor bias = create_tensor(fc_set.bias_shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(fc_set.dst_shape, dt, 1, fixed_point_position);
+
+ // Swap the first and second dimension of weights' shape if transpose_weights is true
+ TensorShape ws = fc_set.weights_shape;
+ if(fc_set.transpose_weights)
+ {
+ const size_t dimx = ws.x();
+ ws.set(0, ws.y());
+ ws.set(1, dimx);
+ }
+
+ Tensor weights = create_tensor(ws, dt, 1, fixed_point_position);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(weights.info()->is_resizable());
+ BOOST_TEST(bias.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function.
+ // Note: We pass the weights already transposed
+ NEFullyConnectedLayer fc;
+ fc.configure(&src, &weights, &bias, &dst, false);
+
+ // Validate valid region
+ const ValidRegion src_valid_region = shape_to_valid_region(fc_set.src_shape);
+ const ValidRegion weights_valid_region = shape_to_valid_region(ws);
+ const ValidRegion bias_valid_region = shape_to_valid_region(fc_set.bias_shape);
+ const ValidRegion dst_valid_region = shape_to_valid_region(fc_set.dst_shape);
+
+ validate(src.info()->valid_region(), src_valid_region);
+ validate(weights.info()->valid_region(), weights_valid_region);
+ validate(bias.info()->valid_region(), bias_valid_region);
+ validate(dst.info()->valid_region(), dst_valid_region);
+}
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall,
+ SmallFullyConnectedLayerDataset() * boost::unit_test::data::make({ DataType::F32 }),
+ fc_set, dt)
+{
+ // Compute function
+ Tensor dst = compute_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f32);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge,
+ LargeFullyConnectedLayerDataset() * boost::unit_test::data::make({ DataType::F32 }),
+ fc_set, dt)
+{
+ // Compute function
+ Tensor dst = compute_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, 0);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f32);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall,
+ SmallFullyConnectedLayerDataset() * boost::unit_test::data::make({ DataType::QS8 }) * boost::unit_test::data::xrange(4, 7),
+ fc_set, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_qs8);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge,
+ LargeFullyConnectedLayerDataset() * boost::unit_test::data::make({ DataType::QS8 }) * boost::unit_test::data::xrange(4, 7),
+ fc_set, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fully_connected_layer(fc_set.src_shape, fc_set.weights_shape, fc_set.bias_shape, fc_set.dst_shape, dt, fc_set.transpose_weights, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_qs8);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
new file mode 100644
index 0000000000..0172ddeb76
--- /dev/null
+++ b/tests/validation/NEON/GEMM.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "dataset/GEMMDataset.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance_f32 = 1e-03f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+const float tolerance_qs8 = 1.0f; /**< Tolerance value for comparing reference's output against implementation's output for DataType::QS8 */
+
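+/** Compute Neon GEMM function: d = alpha * (a * b) + beta * c.
+ *
+ * @param[in] src_shape1           Shape of the first input tensor (a).
+ * @param[in] src_shape2           Shape of the second input tensor (b).
+ * @param[in] src_shape3           Shape of the third input tensor (c).
+ * @param[in] out_shape            Shape of the output tensor (d).
+ * @param[in] alpha                Weight of the matrix product.
+ * @param[in] beta                 Weight of the third matrix.
+ * @param[in] dt                   Data type of the tensors.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of fixed point numbers (default = 0).
+ *
+ * @return Computed output tensor.
+ */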
+Tensor compute_gemm(const TensorShape &src_shape1, const TensorShape &src_shape2, const TensorShape &src_shape3,
+ const TensorShape &out_shape, float alpha, float beta, DataType dt, int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(src_shape1, dt, 1, fixed_point_position);
+ Tensor src2 = create_tensor(src_shape2, dt, 1, fixed_point_position);
+ Tensor src3 = create_tensor(src_shape3, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(out_shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NEGEMM gemm;
+ gemm.configure(&src1, &src2, &src3, &dst, alpha, beta);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ src3.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!src3.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ if(dt == DataType::F32)
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(NEAccessor(src1), distribution, 0);
+ library->fill(NEAccessor(src2), distribution, 1);
+ library->fill(NEAccessor(src3), distribution, 2);
+ }
+ else
+ {
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+ library->fill_tensor_uniform(NEAccessor(src3), 2);
+ }
+
+ // Compute function
+ gemm.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(GEMM)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration,
+ SmallGEMMDataset() * boost::unit_test::data::make({ DataType::F32, DataType::QS8 }),
+ gemm_set, dt)
+{
+    // Use a fixed point position of 3 when the data type is fixed point
+ int fixed_point_position = (dt == DataType::F32) ? 0 : 3;
+
+ // Create tensors
+ Tensor src1 = create_tensor(gemm_set.shape_a, dt, 1, fixed_point_position);
+ Tensor src2 = create_tensor(gemm_set.shape_b, dt, 1, fixed_point_position);
+ Tensor src3 = create_tensor(gemm_set.shape_c, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(gemm_set.shape_d, dt, 1, fixed_point_position);
+
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(src3.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEGEMM gemm;
+ gemm.configure(&src1, &src2, &src3, &dst, gemm_set.alpha, gemm_set.beta);
+
+ // Validate valid region
+ const ValidRegion src1_valid_region = shape_to_valid_region(gemm_set.shape_a);
+ const ValidRegion src2_valid_region = shape_to_valid_region(gemm_set.shape_b);
+ const ValidRegion src3_valid_region = shape_to_valid_region(gemm_set.shape_c);
+ const ValidRegion dst_valid_region = shape_to_valid_region(gemm_set.shape_d);
+
+ validate(src1.info()->valid_region(), src1_valid_region);
+ validate(src2.info()->valid_region(), src2_valid_region);
+ validate(src3.info()->valid_region(), src3_valid_region);
+ validate(dst.info()->valid_region(), dst_valid_region);
+}
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(SmallGEMM, SmallGEMMDataset() * boost::unit_test::data::make(DataType::F32),
+ gemm_set, dt)
+{
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt);
+
+ // Compute function
+ Tensor dst = compute_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f32);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(LargeGEMM, LargeGEMMDataset() * boost::unit_test::data::make(DataType::F32),
+ gemm_set, dt)
+{
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt);
+
+ // Compute function
+ Tensor dst = compute_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f32);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(SmallGEMM, SmallGEMMDataset() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 7),
+ gemm_set, dt, fixed_point_position)
+{
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt, fixed_point_position);
+
+ // Compute function
+ Tensor dst = compute_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_qs8);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(LargeGEMM, LargeGEMMDataset() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 7),
+ gemm_set, dt, fixed_point_position)
+{
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt, fixed_point_position);
+
+ // Compute function
+ Tensor dst = compute_gemm(gemm_set.shape_a, gemm_set.shape_b, gemm_set.shape_c, gemm_set.shape_d, gemm_set.alpha, gemm_set.beta, dt, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_qs8);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/IntegralImage.cpp b/tests/validation/NEON/IntegralImage.cpp
new file mode 100644
index 0000000000..f94af430d1
--- /dev/null
+++ b/tests/validation/NEON/IntegralImage.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon integral image function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_integral_image(const TensorShape &shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U32);
+
+    // Create and configure function
+ NEIntegralImage integral_image;
+ integral_image.configure(&src, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+
+ // Compute function
+ integral_image.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(IntegralImage)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, SmallShapes() + LargeShapes(), shape)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U32);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+    // Create and configure function
+ NEIntegralImage integral_image;
+ integral_image.configure(&src, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize src_padding(0, required_padding(shape.x(), 16), 0, 0);
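+    // The output is expected to have one element of top/left padding, which holds the zero border used as the previous row/column sums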
+ const PaddingSize dst_padding(1, required_padding(shape.x(), 16), 0, 1);
+
+ validate(src.info()->padding(), src_padding);
+ validate(dst.info()->padding(), dst_padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_integral_image(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_integral_image(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes(), shape)
+{
+ // Compute function
+ Tensor dst = compute_integral_image(shape);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_integral_image(shape);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp
new file mode 100644
index 0000000000..ff791effa0
--- /dev/null
+++ b/tests/validation/NEON/NormalizationLayer.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TypePrinter.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+
+#include <random>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Define tolerance of the normalization layer depending on values data type.
+ *
+ * @param[in] dt Data type of the tensors' values.
+ *
+ * @return Tolerance depending on the data type.
+ */
+float normalization_layer_tolerance(DataType dt)
+{
+ switch(dt)
+ {
+ case DataType::QS8:
+ return 2.0f;
+        case DataType::F32:
+            return 1e-05f;
+ default:
+ return 0.f;
+ }
+}
+
+/** Compute Neon normalization layer function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt Data type of input and output tensors.
+ * @param[in] norm_info Normalization Layer information.
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16 (default = 0).
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_normalization_layer(const TensorShape &shape, DataType dt, NormalizationLayerInfo norm_info, int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NENormalizationLayer norm;
+ norm.configure(&src, &dst, norm_info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ if(dt == DataType::QS8)
+ {
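+        // Restrict the fill range to [-1, 1] in the QS8 representation so that intermediate results stay representable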
+ const int8_t one_fixed_point = 1 << fixed_point_position;
+ const int8_t minus_one_fixed_point = -one_fixed_point;
+ library->fill_tensor_uniform(NEAccessor(src), 0, minus_one_fixed_point, one_fixed_point);
+ }
+ else
+ {
+ library->fill_tensor_uniform(NEAccessor(src), 0);
+ }
+
+ // Compute function
+ norm.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(NormalizationLayer)
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall,
+                     SmallShapes() * DataType::F32 * NormalizationTypes() * boost::unit_test::data::xrange(3, 9, 2) * boost::unit_test::data::make({ 0.5f, 1.0f, 2.0f }),
+                     shape, dt, norm_type, norm_size, beta)
+{
+ // Provide normalization layer information
+ NormalizationLayerInfo norm_info(norm_type, norm_size, 5, beta);
+
+ // Compute function
+ Tensor dst = compute_normalization_layer(shape, dt, norm_info);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_normalization_layer(shape, dt, norm_info);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, normalization_layer_tolerance(DataType::F32));
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall,
+                     SmallShapes() * DataType::QS8 * NormalizationTypes() * boost::unit_test::data::xrange(3, 7, 2) * (boost::unit_test::data::xrange(1, 6) * boost::unit_test::data::make({ 0.5f, 1.0f, 2.0f })),
+                     shape, dt, norm_type, norm_size, fixed_point_position, beta)
+{
+ // Provide normalization layer information
+ NormalizationLayerInfo norm_info(norm_type, norm_size, 5, beta, 1.f);
+
+ // Compute function
+ Tensor dst = compute_normalization_layer(shape, dt, norm_info, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_normalization_layer(shape, dt, norm_info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, normalization_layer_tolerance(DataType::QS8));
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
new file mode 100644
index 0000000000..c6c2792126
--- /dev/null
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Neon pixel-wise multiplication function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] scale Non-negative scale.
+ * @param[in] convert_policy Overflow policy of the operation.
+ * @param[in] rounding_policy Rounding policy of the operation.
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_pixel_wise_multiplication(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
+ int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt_in0, 1, fixed_point_position);
+ Tensor src2 = create_tensor(shape, dt_in1, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt_out, 1, fixed_point_position);
+
+ // Create and configure function
+ NEPixelWiseMultiplication multiply;
+ multiply.configure(&src1, &src2, &dst, scale, convert_policy, rounding_policy);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ src2.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!src2.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+ library->fill_tensor_uniform(NEAccessor(src2), 1);
+
+ // Compute function
+ multiply.run();
+
+ return dst;
+}
+
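+/** Validate the configuration of a pixel-wise multiplication function.
+ *
+ * @param[in]     src1            First input tensor.
+ * @param[in]     src2            Second input tensor.
+ * @param[in,out] dst             Output tensor.
+ * @param[in]     shape           Shape of the input and output tensors.
+ * @param[in]     scale           Non-negative scale.
+ * @param[in]     convert_policy  Overflow policy of the operation.
+ * @param[in]     rounding_policy Rounding policy of the operation.
+ */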
+void validate_configuration(const Tensor &src1, const Tensor &src2, Tensor &dst, TensorShape shape, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(src2.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEPixelWiseMultiplication multiply;
+ multiply.configure(&src1, &src2, &dst, scale, convert_policy, rounding_policy);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(src2.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(src2.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(PixelWiseMultiplication)
+
+BOOST_AUTO_TEST_SUITE(U8)
+
+BOOST_AUTO_TEST_SUITE(Scale255)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * (1.f / 255.f) * ConvertPolicies()
+ * RoundingPolicy::TO_NEAREST_UP,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ validate_configuration(src1, src2, dst, shape, scale, convert_policy, rounding_policy);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_NEAREST_UP,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::U8, DataType::U8, DataType::U8, scale, convert_policy,
+ rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::U8, DataType::U8,
+ DataType::U8, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ // Allow tolerance value of 1.f to counteract imprecision due to 32-bit float conversion
+ validate(NEAccessor(dst), ref_dst, 1.f, 0.f, std::numeric_limits<uint8_t>::max());
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * (1.f / 255.f) * ConvertPolicies() * RoundingPolicy::TO_NEAREST_UP,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::U8, DataType::U8, DataType::U8, scale, convert_policy,
+ rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::U8, DataType::U8,
+ DataType::U8, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ // Allow tolerance value of 1.f to counteract imprecision due to 32-bit float conversion
+ validate(NEAccessor(dst), ref_dst, 1.f, 0.f, std::numeric_limits<uint8_t>::max());
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(ScaleOther)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ 1.f, 1.f / 32768.f })
+ * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor src2 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ validate_configuration(src1, src2, dst, shape, scale, convert_policy, rounding_policy);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ 1.f, 1.f / 32768.f }) * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::U8, DataType::U8, DataType::U8, scale, convert_policy,
+ rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::U8, DataType::U8,
+ DataType::U8, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ 1.f, 1.f / 32768.f }) * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::U8, DataType::U8, DataType::U8, scale, convert_policy,
+ rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::U8, DataType::U8,
+ DataType::U8, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(S16)
+BOOST_AUTO_TEST_SUITE(Scale255)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * (1.f / 255.f) * ConvertPolicies()
+ * RoundingPolicy::TO_NEAREST_UP,
+ shape, dt, scale, convert_policy, rounding_policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt);
+ Tensor src2 = create_tensor(shape, DataType::S16);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ validate_configuration(src1, src2, dst, shape, scale, convert_policy, rounding_policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * (1.f / 255.f) * ConvertPolicies()
+ * RoundingPolicy::TO_NEAREST_UP,
+ shape, dt, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ // Allow tolerance value of 2.f to counteract imprecision due to 32-bit float conversion
+ validate(NEAccessor(dst), ref_dst, 2.f, 0.f, std::numeric_limits<int16_t>::max());
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * (1.f / 255.f) * ConvertPolicies()
+ * RoundingPolicy::TO_NEAREST_UP,
+ shape, dt, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16,
+ scale, convert_policy, rounding_policy);
+
+ // Validate output
+ // Allow tolerance value of 2.f to counteract imprecision due to 32-bit float conversion
+ validate(NEAccessor(dst), ref_dst, 2.f, 0.f, std::numeric_limits<int16_t>::max());
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(ScaleOther)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ 1.f, 1.f / 32768.f })
+ * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, dt, scale, convert_policy, rounding_policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, dt);
+ Tensor src2 = create_tensor(shape, DataType::S16);
+ Tensor dst = create_tensor(shape, DataType::S16);
+
+ validate_configuration(src1, src2, dst, shape, scale, convert_policy, rounding_policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ 1.f, 1.f / 32768.f }) * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, dt, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ DataType::U8, DataType::S16 }) * boost::unit_test::data::make({ 1.f, 1.f / 32768.f }) * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, dt, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, dt, DataType::S16, DataType::S16,
+ scale, convert_policy, rounding_policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(F32)
+BOOST_AUTO_TEST_SUITE(Scale255)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * (1.f / 255.f) * ConvertPolicies()
+ * RoundingPolicy::TO_NEAREST_UP,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::F32);
+ Tensor src2 = create_tensor(shape, DataType::F32);
+ Tensor dst = create_tensor(shape, DataType::F32);
+
+ validate_configuration(src1, src2, dst, shape, scale, convert_policy, rounding_policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * (1.f / 255.f) * ConvertPolicies()
+ * RoundingPolicy::TO_NEAREST_UP,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ // Allow tolerance value of 1.f to counteract imprecision due to 32-bit float conversion
+ validate(NEAccessor(dst), ref_dst, 1.f, 0.f, std::numeric_limits<int16_t>::max());
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * (1.f / 255.f) * ConvertPolicies()
+ * RoundingPolicy::TO_NEAREST_UP,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32,
+ scale, convert_policy, rounding_policy);
+
+ // Validate output
+ // Allow tolerance value of 1.f to counteract imprecision due to 32-bit float conversion
+ validate(NEAccessor(dst), ref_dst, 1.f, 0.f, std::numeric_limits<int16_t>::max());
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(ScaleOther)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ 1.f, 1.f / 32768.f })
+ * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::F32);
+ Tensor src2 = create_tensor(shape, DataType::F32);
+ Tensor dst = create_tensor(shape, DataType::F32);
+
+ validate_configuration(src1, src2, dst, shape, scale, convert_policy, rounding_policy);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ 1.f, 1.f / 32768.f }) * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32, scale, convert_policy, rounding_policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ 1.f, 1.f / 32768.f }) * ConvertPolicies()
+ * RoundingPolicy::TO_ZERO,
+ shape, scale, convert_policy, rounding_policy)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32, scale, convert_policy, rounding_policy);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pixel_wise_multiplication(shape, DataType::F32, DataType::F32, DataType::F32,
+ scale, convert_policy, rounding_policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(QS8)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * DataType::QS8 * ConvertPolicies() * RoundingPolicy::TO_ZERO * boost::unit_test::data::xrange<int>(1, 7),
+ shape, dt, convert_policy, rounding_policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_pixel_wise_multiplication(shape, dt, dt, dt, 1.f, convert_policy, rounding_policy, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_fixed_point_pixel_wise_multiplication(shape, dt, dt, dt, 1.f, fixed_point_position, convert_policy, rounding_policy);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Pooling/PoolingLayer.cpp b/tests/validation/NEON/Pooling/PoolingLayer.cpp
new file mode 100644
index 0000000000..b15ad1c5e6
--- /dev/null
+++ b/tests/validation/NEON/Pooling/PoolingLayer.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TypePrinter.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "tests/dataset/PoolingLayerDataset.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include <iostream>
+#include <random>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance_q = 0; /**< Tolerance value for comparing reference's output against implementation's output for quantized input */
+const float tolerance_f = 1e-05f; /**< Tolerance value for comparing reference's output against implementation's output for float input */
+
+/** Compute Neon pooling layer function.
+ *
+ * @param[in] shape_in             Shape of the input tensor.
+ * @param[in] shape_out            Shape of the output tensor.
+ * @param[in] dt                   Data type of input and output tensors.
+ * @param[in] pool_info            Pooling Layer information.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of fixed point numbers (default = 0).
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_pooling_layer(const TensorShape &shape_in, const TensorShape &shape_out, DataType dt, PoolingLayerInfo pool_info, int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape_in, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape_out, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NEPoolingLayer pool;
+ pool.configure(&src, &dst, pool_info);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ int min = 0;
+ int max = 0;
+ switch(dt)
+ {
+ case DataType::F32:
+ min = -1;
+ max = 1;
+ break;
+ case DataType::QS8:
+ min = -(1 << fixed_point_position);
+ max = (1 << fixed_point_position);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported.");
+ }
+ std::uniform_real_distribution<> distribution(min, max);
+ library->fill(NEAccessor(src), distribution, 0);
+
+ // Compute function
+ pool.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(Pooling)
+BOOST_AUTO_TEST_SUITE(PoolingLayer)
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RandomDataset,
+ RandomPoolingLayerDataset() * boost::unit_test::data::make(DataType::F32),
+ obj, dt)
+{
+ // Compute function
+ Tensor dst = compute_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_f, 0);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RandomDataset,
+ RandomPoolingLayerDataset() * boost::unit_test::data::make(DataType::QS8) * boost::unit_test::data::xrange(1, 5),
+ obj, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_pooling_layer(obj.src_shape, obj.dst_shape, dt, obj.info, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_q, 0);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
new file mode 100644
index 0000000000..f5c7a21abd
--- /dev/null
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Tolerance for float operations */
+const float tolerance = 0.000001f;
+/** Tolerance for fixed point operations */
+const float tolerance_fixed_point = 2.f;
+
+/** Compute Neon softmax layer function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt                   Data type of the input and output tensors.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of fixed point numbers.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_softmax_layer(const TensorShape &shape, DataType dt, int fixed_point_position = 0)
+{
+ // Create tensors
+ Tensor src = create_tensor(shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ // Create and configure function
+ NESoftmaxLayer smx_layer;
+ smx_layer.configure(&src, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ if(arm_compute::is_data_type_float(dt))
+ {
+ std::uniform_real_distribution<> distribution(-10, 10);
+ library->fill(NEAccessor(src), distribution, 0);
+ }
+ else
+ {
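+        // Restrict the fill range to [-1, 1] in the fixed point representation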
+ int one_fixed = 1 << fixed_point_position;
+ std::uniform_int_distribution<> distribution(-one_fixed, one_fixed);
+ library->fill(NEAccessor(src), distribution, 0);
+ }
+
+ // Compute function
+ smx_layer.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(SoftmaxLayer)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * CNNDataTypes(), shape, dt)
+{
+    // Use a fixed point position of 3 when the data type is fixed point
+ int fixed_point_position = (arm_compute::is_data_type_fixed_point(dt)) ? 3 : 0;
+
+ // Create tensors
+ Tensor src = create_tensor(shape, dt, 1, fixed_point_position);
+ Tensor dst = create_tensor(shape, dt, 1, fixed_point_position);
+
+ BOOST_TEST(src.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NESoftmaxLayer smx_layer;
+ smx_layer.configure(&src, &dst);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
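+    // The kernels process 16 bytes per iteration, so the element step (and hence the required padding) depends on the data type size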
+ int step = 16 / arm_compute::data_size_from_type(dt);
+ const PaddingSize padding(0, required_padding(shape.x(), step), 0, 0);
+ validate(src.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_AUTO_TEST_SUITE(Float)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFloatDataTypes(), shape, dt)
+{
+ // Compute function
+ Tensor dst = compute_softmax_layer(shape, dt);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * CNNFloatDataTypes(), shape, dt)
+{
+ // Compute function
+ Tensor dst = compute_softmax_layer(shape, dt);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(Quantized)
+// Test fixed point positions in the range [1,6): the reciprocal computation limits the maximum fixed point position to 5
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(1, 6),
+ shape, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_softmax_layer(shape, dt, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_fixed_point);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(1, 6),
+ shape, dt, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_softmax_layer(shape, dt, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst, tolerance_fixed_point);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Threshold.cpp b/tests/validation/NEON/Threshold.cpp
new file mode 100644
index 0000000000..6ac6f3d26b
--- /dev/null
+++ b/tests/validation/NEON/Threshold.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "dataset/ThresholdDataset.h"
+#include "validation/Datasets.h"
+#include "validation/Reference.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+/** Compute Threshold function.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+ * @param[in] false_value Value to set when the condition is not respected.
+ * @param[in] true_value  Value to set when the condition is respected.
+ * @param[in] type Thresholding type. Either RANGE or BINARY.
+ * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_threshold(const TensorShape &shape, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ // Create and configure function
+ NEThreshold thrsh;
+ thrsh.configure(&src1, &dst, threshold, false_value, true_value, type, upper);
+
+ // Allocate tensors
+ src1.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ BOOST_TEST(!src1.info()->is_resizable());
+ BOOST_TEST(!dst.info()->is_resizable());
+
+ // Fill tensors
+ library->fill_tensor_uniform(NEAccessor(src1), 0);
+
+ // Compute function
+ thrsh.run();
+
+ return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(Threshold)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration,
+ (SmallShapes() + LargeShapes()) * ThresholdDataset(),
+ shape, thrshConf)
+{
+ // Create tensors
+ Tensor src1 = create_tensor(shape, DataType::U8);
+ Tensor dst = create_tensor(shape, DataType::U8);
+
+ BOOST_TEST(src1.info()->is_resizable());
+ BOOST_TEST(dst.info()->is_resizable());
+
+ // Create and configure function
+ NEThreshold thrsh;
+ thrsh.configure(&src1, &dst, thrshConf.threshold, thrshConf.false_value, thrshConf.true_value, thrshConf.type, thrshConf.upper);
+
+ // Validate valid region
+ const ValidRegion valid_region = shape_to_valid_region(shape);
+ validate(src1.info()->valid_region(), valid_region);
+ validate(dst.info()->valid_region(), valid_region);
+
+ // Validate padding
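+ // Editor's note: the configure step is expected to pad each row to a multiple
+ // of the 16-element U8 NEON processing width, so e.g. a width of 30 would
+ // require 2 elements of right padding (based on the required_padding() call below).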
+ const PaddingSize padding(0, required_padding(shape.x(), 16), 0, 0);
+ validate(src1.info()->padding(), padding);
+ validate(dst.info()->padding(), padding);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall,
+ SmallShapes() * ThresholdDataset(),
+ shape, thrshConf)
+{
+ // Compute function
+ Tensor dst = compute_threshold(shape, thrshConf.threshold, thrshConf.false_value, thrshConf.true_value, thrshConf.type, thrshConf.upper);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_threshold(shape, thrshConf.threshold, thrshConf.false_value, thrshConf.true_value, thrshConf.type, thrshConf.upper);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge,
+ LargeShapes() * ThresholdDataset(),
+ shape, thrshConf)
+{
+ // Compute function
+ Tensor dst = compute_threshold(shape, thrshConf.threshold, thrshConf.false_value, thrshConf.true_value, thrshConf.type, thrshConf.upper);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_threshold(shape, thrshConf.threshold, thrshConf.false_value, thrshConf.true_value, thrshConf.type, thrshConf.upper);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp
new file mode 100644
index 0000000000..263c57b16b
--- /dev/null
+++ b/tests/validation/Reference.cpp
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Reference.h"
+
+#include "Globals.h"
+#include "Helpers.h"
+#include "ReferenceCPP.h"
+#include "TensorLibrary.h"
+#include "validation/Helpers.h"
+
+#include <random>
+
+using namespace arm_compute::test;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
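+// Editor's note: an integral image stores at (x, y) the sum of all input
+// pixels in the rectangle from (0, 0) to (x, y) inclusive, i.e.
+// dst(x, y) = sum(src(i, j)) for 0 <= i <= x, 0 <= j <= y.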
+RawTensor Reference::compute_reference_integral_image(const TensorShape &shape)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U32);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src, 0);
+
+ // Compute reference
+ ReferenceCPP::integral_image(ref_src, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_absolute_difference(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, dt_in0);
+ RawTensor ref_src2 = library->get(shape, dt_in1);
+ RawTensor ref_dst = library->get(shape, dt_out);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::absolute_difference(ref_src1, ref_src2, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_accumulate(const TensorShape &shape)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::S16);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src, 0);
+ library->fill_tensor_uniform(ref_dst, 1);
+
+ // Compute reference
+ ReferenceCPP::accumulate(ref_src, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_accumulate_squared(const TensorShape &shape, uint32_t shift)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::S16);
+
+ // Fill reference
+ // ref_dst tensor filled with non-negative values
+ library->fill_tensor_uniform(ref_src, 0);
+ library->fill_tensor_uniform(ref_dst, 1, static_cast<int16_t>(0), std::numeric_limits<int16_t>::max());
+
+ // Compute reference
+ ReferenceCPP::accumulate_squared(ref_src, ref_dst, shift);
+
+ return ref_dst;
+}
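+// Editor's note: accumulate squared is assumed to follow the OpenVX definition,
+// dst = saturate_S16(dst + ((src * src) >> shift)).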
+
+RawTensor Reference::compute_reference_accumulate_weighted(const TensorShape &shape, float alpha)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U8);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src, 0);
+ library->fill_tensor_uniform(ref_dst, 1);
+
+ // Compute reference
+ ReferenceCPP::accumulate_weighted(ref_src, ref_dst, alpha);
+
+ return ref_dst;
+}
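+// Editor's note: accumulate weighted presumably computes the running average
+// dst = (1 - alpha) * dst + alpha * src, so alpha = 1 replaces dst with src.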
+
+RawTensor Reference::compute_reference_arithmetic_addition(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, ConvertPolicy convert_policy)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, dt_in0);
+ RawTensor ref_src2 = library->get(shape, dt_in1);
+ RawTensor ref_dst = library->get(shape, dt_out);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::arithmetic_addition(ref_src1, ref_src2, ref_dst, convert_policy);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_arithmetic_subtraction(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, ConvertPolicy convert_policy)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, dt_in0);
+ RawTensor ref_src2 = library->get(shape, dt_in1);
+ RawTensor ref_dst = library->get(shape, dt_out);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::arithmetic_subtraction(ref_src1, ref_src2, ref_dst, convert_policy);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_bitwise_and(const TensorShape &shape)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, DataType::U8);
+ RawTensor ref_src2 = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U8);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::bitwise_and(ref_src1, ref_src2, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_bitwise_or(const TensorShape &shape)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, DataType::U8);
+ RawTensor ref_src2 = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U8);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::bitwise_or(ref_src1, ref_src2, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_bitwise_xor(const TensorShape &shape)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, DataType::U8);
+ RawTensor ref_src2 = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U8);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::bitwise_xor(ref_src1, ref_src2, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_bitwise_not(const TensorShape &shape)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U8);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src, 0);
+
+ // Compute reference
+ ReferenceCPP::bitwise_not(ref_src, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_box3x3(const TensorShape &shape)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U8);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src, 0);
+
+ // Compute reference
+ ReferenceCPP::box3x3(ref_src, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_depth_convert(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift, uint32_t fixed_point_position)
+{
+ RawTensor ref_src = library->get(shape, dt_in, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape, dt_out, 1, fixed_point_position);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src, 0);
+
+ // Compute reference
+ ReferenceCPP::depth_convert(ref_src, ref_dst, policy, shift);
+
+ return ref_dst;
+}
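+// Editor's note: the shift in depth conversion is assumed to be applied as a
+// left shift when converting to a wider type (e.g. U8 -> U16) and as a right
+// shift when converting down, before the overflow policy takes effect.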
+
+RawTensor Reference::compute_reference_gemm(const TensorShape &src_shape1, const TensorShape &src_shape2, const TensorShape &src_shape3,
+ const TensorShape &dst_shape, float alpha, float beta, DataType dt, int fixed_point_position)
+{
+ RawTensor src1 = library->get(src_shape1, dt, 1, fixed_point_position);
+ RawTensor src2 = library->get(src_shape2, dt, 1, fixed_point_position);
+ RawTensor src3 = library->get(src_shape3, dt, 1, fixed_point_position);
+ RawTensor dst = library->get(dst_shape, dt, 1, fixed_point_position);
+
+ // Fill reference
+ if(dt == DataType::F32)
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(src1, distribution, 0);
+ library->fill(src2, distribution, 1);
+ library->fill(src3, distribution, 2);
+ }
+ else
+ {
+ library->fill_tensor_uniform(src1, 0);
+ library->fill_tensor_uniform(src2, 1);
+ library->fill_tensor_uniform(src3, 2);
+ }
+
+ // Compute reference
+ ReferenceCPP::gemm(src1, src2, src3, dst, alpha, beta);
+
+ return dst;
+}
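+// Editor's note (sketch, not part of the original sources): the reference GEMM
+// is assumed to compute the BLAS-style result dst = alpha * src1 * src2 + beta * src3,
+// i.e. roughly:
+//   for(int m = 0; m < M; ++m)
+//       for(int n = 0; n < N; ++n)
+//       {
+//           float acc = 0.f;
+//           for(int k = 0; k < K; ++k)
+//               acc += a(m, k) * b(k, n);
+//           d(m, n) = alpha * acc + beta * c(m, n);
+//       }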
+
+RawTensor Reference::compute_reference_pixel_wise_multiplication(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, float scale, ConvertPolicy convert_policy,
+ RoundingPolicy rounding_policy)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, dt_in0);
+ RawTensor ref_src2 = library->get(shape, dt_in1);
+ RawTensor ref_dst = library->get(shape, dt_out);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::pixel_wise_multiplication(ref_src1, ref_src2, ref_dst, scale, convert_policy, rounding_policy);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_fixed_point_pixel_wise_multiplication(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, float scale, int fixed_point_position,
+ ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, dt_in0, 1, fixed_point_position);
+ RawTensor ref_src2 = library->get(shape, dt_in1, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape, dt_out, 1, fixed_point_position);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+ library->fill_tensor_uniform(ref_src2, 1);
+
+ // Compute reference
+ ReferenceCPP::fixed_point_pixel_wise_multiplication(ref_src1, ref_src2, ref_dst, scale, convert_policy, rounding_policy);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_threshold(const TensorShape &shape, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ // Create reference
+ RawTensor ref_src1 = library->get(shape, DataType::U8);
+ RawTensor ref_dst = library->get(shape, DataType::U8);
+
+ // Fill reference
+ library->fill_tensor_uniform(ref_src1, 0);
+
+ // Compute reference
+ ReferenceCPP::threshold(ref_src1, ref_dst, threshold, false_value, true_value, type, upper);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_activation_layer(const TensorShape &shape, DataType dt, ActivationLayerInfo act_info, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, dt, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape, dt, 1, fixed_point_position);
+
+ // Fill reference
+ if(dt == DataType::F32)
+ {
+ float min_bound = 0;
+ float max_bound = 0;
+ std::tie(min_bound, max_bound) = get_activation_layer_test_bounds<float>(act_info.activation());
+ std::uniform_real_distribution<> distribution(min_bound, max_bound);
+ library->fill(ref_src, distribution, 0);
+ }
+ else
+ {
+ int min_bound = 0;
+ int max_bound = 0;
+ std::tie(min_bound, max_bound) = get_activation_layer_test_bounds<int8_t>(act_info.activation(), fixed_point_position);
+ std::uniform_int_distribution<> distribution(min_bound, max_bound);
+ library->fill(ref_src, distribution, 0);
+ }
+
+ // Compute reference
+ ReferenceCPP::activation_layer(ref_src, ref_dst, act_info);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_batch_normalization_layer(const TensorShape &shape0, const TensorShape &shape1, DataType dt, float epsilon, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape0, dt, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape0, dt, 1, fixed_point_position);
+ RawTensor ref_mean = library->get(shape1, dt, 1, fixed_point_position);
+ RawTensor ref_var = library->get(shape1, dt, 1, fixed_point_position);
+ RawTensor ref_beta = library->get(shape1, dt, 1, fixed_point_position);
+ RawTensor ref_gamma = library->get(shape1, dt, 1, fixed_point_position);
+
+ // Fill tensors with values from the batch normalization test bounds (approximately -1 to 1).
+ if(dt == DataType::F32)
+ {
+ float min_bound = 0.f;
+ float max_bound = 0.f;
+ std::tie(min_bound, max_bound) = get_batchnormalization_layer_test_bounds<float>();
+ std::uniform_real_distribution<> distribution(min_bound, max_bound);
+ std::uniform_real_distribution<> distribution_var(0, max_bound);
+ library->fill(ref_src, distribution, 0);
+ library->fill(ref_mean, distribution, 1);
+ library->fill(ref_var, distribution_var, 0);
+ library->fill(ref_beta, distribution, 3);
+ library->fill(ref_gamma, distribution, 4);
+ }
+ else
+ {
+ int min_bound = 0;
+ int max_bound = 0;
+ std::tie(min_bound, max_bound) = get_batchnormalization_layer_test_bounds<int8_t>(fixed_point_position);
+ std::uniform_int_distribution<> distribution(min_bound, max_bound);
+ std::uniform_int_distribution<> distribution_var(0, max_bound);
+ library->fill(ref_src, distribution, 0);
+ library->fill(ref_mean, distribution, 1);
+ library->fill(ref_var, distribution_var, 0);
+ library->fill(ref_beta, distribution, 3);
+ library->fill(ref_gamma, distribution, 4);
+ }
+
+ // Compute reference
+ ReferenceCPP::batch_normalization_layer(ref_src, ref_dst, ref_mean, ref_var, ref_beta, ref_gamma, epsilon, fixed_point_position);
+
+ return ref_dst;
+}
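+// Editor's note: batch normalization is expected to compute, per element,
+// dst = gamma * (src - mean) / sqrt(var + epsilon) + beta, with mean, var,
+// beta and gamma taken from the vector tensors along the feature dimension.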
+
+RawTensor Reference::compute_reference_convolution_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt,
+ const PadStrideInfo &conv_info, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(input_shape, dt, 1, fixed_point_position);
+ RawTensor ref_weights = library->get(weights_shape, dt, 1, fixed_point_position);
+ RawTensor ref_bias = library->get(bias_shape, dt, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(output_shape, dt, 1, fixed_point_position);
+
+ // Fill reference
+ if(dt == DataType::F32)
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(ref_src, distribution, 0);
+ library->fill(ref_weights, distribution, 1);
+ library->fill(ref_bias, distribution, 2);
+ }
+ else
+ {
+ library->fill_tensor_uniform(ref_src, 0);
+ library->fill_tensor_uniform(ref_weights, 1);
+ library->fill_tensor_uniform(ref_bias, 2);
+ }
+
+ // Compute reference
+ ReferenceCPP::convolution_layer(ref_src, ref_weights, ref_bias, ref_dst, conv_info);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_fully_connected_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape,
+ DataType dt, bool transpose_weights, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(input_shape, dt, 1, fixed_point_position);
+ RawTensor ref_bias = library->get(bias_shape, dt, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(output_shape, dt, 1, fixed_point_position);
+
+ // Swap the first and second dimension of weights' shape if transpose_weights is true
+ TensorShape ws = weights_shape;
+ if(transpose_weights)
+ {
+ const size_t dimx = ws.x();
+ ws.set(0, ws.y());
+ ws.set(1, dimx);
+ }
+
+ RawTensor ref_weights = library->get(ws, dt, 1, fixed_point_position);
+
+ // Fill reference
+ if(dt == DataType::F32)
+ {
+ std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+ library->fill(ref_src, distribution, 0);
+ library->fill(ref_weights, distribution, 1);
+ library->fill(ref_bias, distribution, 2);
+ }
+ else
+ {
+ library->fill_tensor_uniform(ref_src, 0);
+ library->fill_tensor_uniform(ref_weights, 1);
+ library->fill_tensor_uniform(ref_bias, 2);
+ }
+
+ // Compute reference
+ ReferenceCPP::fully_connected_layer(ref_src, ref_weights, ref_bias, ref_dst);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_normalization_layer(const TensorShape &shape, DataType dt, NormalizationLayerInfo norm_info, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, dt, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape, dt, 1, fixed_point_position);
+
+ // Fill reference
+ if(dt == DataType::QS8)
+ {
+ const int8_t one_fixed_point = 1 << fixed_point_position;
+ const int8_t minus_one_fixed_point = -one_fixed_point;
+ library->fill_tensor_uniform(ref_src, 0, minus_one_fixed_point, one_fixed_point);
+ }
+ else
+ {
+ library->fill_tensor_uniform(ref_src, 0);
+ }
+
+ // Compute reference
+ ReferenceCPP::normalization_layer(ref_src, ref_dst, norm_info);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_pooling_layer(const TensorShape &shape_in, const TensorShape &shape_out, DataType dt, PoolingLayerInfo pool_info, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape_in, dt, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape_out, dt, 1, fixed_point_position);
+
+ // Fill reference
+ int min = 0;
+ int max = 0;
+ switch(dt)
+ {
+ case DataType::F32:
+ min = -1;
+ max = 1;
+ break;
+ case DataType::QS8:
+ min = -(1 << fixed_point_position);
+ max = (1 << fixed_point_position);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DataType not supported.");
+ }
+ std::uniform_real_distribution<> distribution(min, max);
+ library->fill(ref_src, distribution, 0);
+
+ // Compute reference
+ ReferenceCPP::pooling_layer(ref_src, ref_dst, pool_info, fixed_point_position);
+
+ return ref_dst;
+}
+
+RawTensor Reference::compute_reference_softmax_layer(const TensorShape &shape, DataType dt, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, dt, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape, dt, 1, fixed_point_position);
+
+ // Fill reference
+ if(arm_compute::is_data_type_float(dt))
+ {
+ std::uniform_real_distribution<> distribution(-10, 10);
+ library->fill(ref_src, distribution, 0);
+ }
+ else
+ {
+ int one_fixed = 1 << fixed_point_position;
+ std::uniform_int_distribution<> distribution(-one_fixed, one_fixed);
+ library->fill(ref_src, distribution, 0);
+ }
+
+ // Compute reference
+ ReferenceCPP::softmax_layer(ref_src, ref_dst);
+
+ return ref_dst;
+}
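+// Editor's note: the softmax reference presumably uses the numerically stable
+// form dst(i) = exp(src(i) - max(src)) / sum_j(exp(src(j) - max(src))),
+// computed along the innermost dimension.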
+
+RawTensor Reference::compute_reference_fixed_point_operation(const TensorShape &shape, DataType dt_in, DataType dt_out, FixedPointOp op, int fixed_point_position)
+{
+ // Create reference
+ RawTensor ref_src = library->get(shape, dt_in, 1, fixed_point_position);
+ RawTensor ref_dst = library->get(shape, dt_out, 1, fixed_point_position);
+
+ // Fill reference
+ int min = 0;
+ int max = 0;
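+ // Editor's note: the ranges below appear to be chosen so that each operation
+ // stays within a domain where the result is representable in the fixed point
+ // format (e.g. strictly positive inputs for LOG and INV_SQRT, and small
+ // inputs for EXP to avoid overflow in QS8).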
+ switch(op)
+ {
+ case(FixedPointOp::INV_SQRT):
+ min = 32;
+ max = 127;
+ break;
+ case(FixedPointOp::LOG):
+ min = (1 << (fixed_point_position - 1));
+ max = 63;
+ break;
+ case(FixedPointOp::EXP):
+ min = 1;
+ max = (1 << (fixed_point_position - 1));
+ break;
+ case(FixedPointOp::RECIPROCAL):
+ min = 15;
+ max = 100;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Fixed point operation not supported");
+ }
+ std::uniform_int_distribution<> distribution(min, max);
+ library->fill(ref_src, distribution, 0);
+
+ // Compute reference
+ ReferenceCPP::fixed_point_operation(ref_src, ref_dst, op);
+
+ return ref_dst;
+}
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/Reference.h b/tests/validation/Reference.h
new file mode 100644
index 0000000000..4e5b462f9e
--- /dev/null
+++ b/tests/validation/Reference.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_REFERENCE_REFERENCE_H__
+#define __ARM_COMPUTE_TEST_REFERENCE_REFERENCE_H__
+
+#include "RawTensor.h"
+#include "Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Interface for reference implementations. */
+class Reference
+{
+public:
+ /** Compute reference integral image.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_integral_image(const TensorShape &shape);
+ /** Compute reference absolute difference.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_absolute_difference(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out);
+ /** Compute reference accumulate.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_accumulate(const TensorShape &shape);
+ /** Compute reference accumulate squared.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] shift A uint32_t value within the range of [0, 15].
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_accumulate_squared(const TensorShape &shape, uint32_t shift);
+ /** Compute reference accumulate weighted.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] alpha A float value within the range of [0, 1].
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_accumulate_weighted(const TensorShape &shape, float alpha);
+ /** Compute reference arithmetic addition.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] convert_policy Overflow policy of the operation.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_arithmetic_addition(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, ConvertPolicy convert_policy);
+ /** Compute reference arithmetic subtraction.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] convert_policy Overflow policy of the operation.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_arithmetic_subtraction(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, ConvertPolicy convert_policy);
+ /** Compute reference bitwise and.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_bitwise_and(const TensorShape &shape);
+ /** Compute reference bitwise or.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_bitwise_or(const TensorShape &shape);
+ /** Compute reference bitwise xor.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_bitwise_xor(const TensorShape &shape);
+ /** Compute reference bitwise not.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_bitwise_not(const TensorShape &shape);
+ /** Compute reference 3-by-3 box filter.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_box3x3(const TensorShape &shape);
+ /** Compute reference depth convert.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in Data type of input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] policy Overflow policy of the operation.
+ * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
+ * @param[in] fixed_point_position Fixed point position.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_depth_convert(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy, uint32_t shift, uint32_t fixed_point_position);
+ /** Compute reference matrix multiply (GEMM) function.
+ *
+ * @param[in] src_shape1 First input tensor shape.
+ * @param[in] src_shape2 Second input tensor shape.
+ * @param[in] src_shape3 Third input tensor shape.
+ * @param[in] dst_shape Output tensor shape.
+ * @param[in] alpha Weight of the matrix product.
+ * @param[in] beta Weight of the third matrix.
+ * @param[in] dt Data type of the tensors.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_gemm(const TensorShape &src_shape1, const TensorShape &src_shape2, const TensorShape &src_shape3,
+ const TensorShape &dst_shape, float alpha, float beta, DataType dt, int fixed_point_position = 0);
+ /** Compute reference pixel-wise multiplication.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] scale Non-negative scale.
+ * @param[in] convert_policy Overflow policy of the operation.
+ * @param[in] rounding_policy Rounding policy of the operation.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_pixel_wise_multiplication(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, float scale, ConvertPolicy convert_policy,
+ RoundingPolicy rounding_policy);
+ /** Compute reference fixed point pixel-wise multiplication.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in0 Data type of first input tensor.
+ * @param[in] dt_in1 Data type of second input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] scale Scale to apply after multiplication. Must be positive.
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number.
+ * @param[in] convert_policy Overflow policy of the operation.
+ * @param[in] rounding_policy Rounding policy of the operation.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_fixed_point_pixel_wise_multiplication(const TensorShape &shape, DataType dt_in0, DataType dt_in1, DataType dt_out, float scale, int fixed_point_position,
+ ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
+ /** Compute reference threshold.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+ * @param[in] false_value Value to set when the condition is not met.
+ * @param[in] true_value Value to set when the condition is met.
+ * @param[in] type Thresholding type. Either RANGE or BINARY.
+ * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_threshold(const TensorShape &shape, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+ /** Compute reference activation layer.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt Data type of the tensors.
+ * @param[in] act_info Activation layer information.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of fixed point numbers.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_activation_layer(const TensorShape &shape, DataType dt, ActivationLayerInfo act_info, int fixed_point_position = 0);
+ /** Compute reference batch normalization layer.
+ *
+ * @param[in] shape0 Shape of the input and output tensors.
+ * @param[in] shape1 Shape of the vector tensors.
+ * @param[in] dt Data type of all input and output tensors.
+ * @param[in] epsilon Small value to avoid division by zero.
+ * @param[in] fixed_point_position (Optional) Fixed point position.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_batch_normalization_layer(const TensorShape &shape0, const TensorShape &shape1, DataType dt, float epsilon, int fixed_point_position = 0);
+ /** Compute reference convolution layer.
+ *
+ * @param[in] input_shape Shape of the input tensor.
+ * @param[in] weights_shape Shape of the weights tensor.
+ * @param[in] bias_shape Shape of the bias tensor.
+ * @param[in] output_shape Shape of the output tensor.
+ * @param[in] dt Data type to use.
+ * @param[in] conv_info Pads and strides information for the convolution layer.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_convolution_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt,
+ const PadStrideInfo &conv_info, int fixed_point_position);
+ /** Compute reference fully connected layer.
+ *
+ * @param[in] input_shape Shape of the input tensor.
+ * @param[in] weights_shape Shape of the weights tensor.
+ * @param[in] bias_shape Shape of the bias tensor.
+ * @param[in] output_shape Shape of the output tensor.
+ * @param[in] dt Data type to use.
+ * @param[in] transpose_weights Transpose the weights if true.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_fully_connected_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt,
+ bool transpose_weights, int fixed_point_position);
+ /** Compute reference normalization layer.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt Data type of input and output tensors.
+ * @param[in] norm_info Normalization Layer information.
+ * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16 (default = 0).
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_normalization_layer(const TensorShape &shape, DataType dt, NormalizationLayerInfo norm_info, int fixed_point_position = 0);
+ /** Compute reference pooling layer.
+ *
+ * @param[in] shape_in Shape of the input tensor.
+ * @param[in] shape_out Shape of the output tensor.
+ * @param[in] dt Data type of input and output tensors.
+ * @param[in] pool_info Pooling Layer information.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_pooling_layer(const TensorShape &shape_in, const TensorShape &shape_out, DataType dt, PoolingLayerInfo pool_info, int fixed_point_position = 0);
+ /** Compute reference softmax layer.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt Data type of input and output tensors.
+ * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_softmax_layer(const TensorShape &shape, DataType dt, int fixed_point_position = 0);
+ /** Compute reference fixed point operation.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ * @param[in] dt_in Data type of the input tensor.
+ * @param[in] dt_out Data type of the output tensor.
+ * @param[in] op Fixed point operation to perform.
+ * @param[in] fixed_point_position Number of bits for the fractional part of the fixed point numbers.
+ *
+ * @return Computed raw tensor.
+ */
+ static RawTensor compute_reference_fixed_point_operation(const TensorShape &shape, DataType dt_in, DataType dt_out, FixedPointOp op, int fixed_point_position);
+
+protected:
+ Reference() = default;
+ ~Reference() = default;
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/validation/ReferenceCPP.cpp b/tests/validation/ReferenceCPP.cpp
new file mode 100644
index 0000000000..ddb84835c3
--- /dev/null
+++ b/tests/validation/ReferenceCPP.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ReferenceCPP.h"
+
+#include "TensorFactory.h"
+#include "TensorOperations.h"
+#include "TensorVisitors.h"
+#include "TypePrinter.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "boost_wrapper.h"
+
+#include <functional>
+#include <numeric>
+#include <vector>
+
+using namespace arm_compute::test::validation::tensor_visitors;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+// Absolute difference
+void ReferenceCPP::absolute_difference(const RawTensor &src1, const RawTensor &src2, RawTensor &dst)
+{
+ const TensorVariant s1 = TensorFactory::get_tensor(src1);
+ const TensorVariant s2 = TensorFactory::get_tensor(src2);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(absolute_difference_visitor(), s1, s2, d);
+}
+
+// Integral image
+void ReferenceCPP::integral_image(const RawTensor &src, RawTensor &dst)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_type() != DataType::U8 || dst.data_type() != DataType::U32);
+ const Tensor<uint8_t> s(src.shape(), src.data_type(), src.fixed_point_position(), reinterpret_cast<const uint8_t *>(src.data()));
+ Tensor<uint32_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint32_t *>(dst.data()));
+ tensor_operations::integral_image(s, d);
+}
+
+// Accumulate
+void ReferenceCPP::accumulate(const RawTensor &src, RawTensor &dst)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_type() != DataType::U8 || dst.data_type() != DataType::S16);
+ const Tensor<uint8_t> s(src.shape(), src.data_type(), src.fixed_point_position(), reinterpret_cast<const uint8_t *>(src.data()));
+ Tensor<int16_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<int16_t *>(dst.data()));
+ tensor_operations::accumulate(s, d);
+}
+
+// Accumulate squared
+void ReferenceCPP::accumulate_squared(const RawTensor &src, RawTensor &dst, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_type() != DataType::U8 || dst.data_type() != DataType::S16);
+ const Tensor<uint8_t> s(src.shape(), src.data_type(), src.fixed_point_position(), reinterpret_cast<const uint8_t *>(src.data()));
+ Tensor<int16_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<int16_t *>(dst.data()));
+ tensor_operations::accumulate_squared(s, d, shift);
+}
+
+// Accumulate weighted
+void ReferenceCPP::accumulate_weighted(const RawTensor &src, RawTensor &dst, float alpha)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_type() != DataType::U8 || dst.data_type() != DataType::U8);
+ const Tensor<uint8_t> s(src.shape(), src.data_type(), src.fixed_point_position(), reinterpret_cast<const uint8_t *>(src.data()));
+ Tensor<uint8_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint8_t *>(dst.data()));
+ tensor_operations::accumulate_weighted(s, d, alpha);
+}
+
+// Arithmetic addition
+void ReferenceCPP::arithmetic_addition(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, ConvertPolicy convert_policy)
+{
+ const TensorVariant s1 = TensorFactory::get_tensor(src1);
+ const TensorVariant s2 = TensorFactory::get_tensor(src2);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(arithmetic_addition_visitor(convert_policy), s1, s2, d);
+}
+
+// Arithmetic subtraction
+void ReferenceCPP::arithmetic_subtraction(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, ConvertPolicy convert_policy)
+{
+ const TensorVariant s1 = TensorFactory::get_tensor(src1);
+ const TensorVariant s2 = TensorFactory::get_tensor(src2);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(arithmetic_subtraction_visitor(convert_policy), s1, s2, d);
+}
+
+// Bitwise and
+void ReferenceCPP::bitwise_and(const RawTensor &src1, const RawTensor &src2, RawTensor &dst)
+{
+ ARM_COMPUTE_ERROR_ON(src1.data_type() != DataType::U8 || src2.data_type() != DataType::U8 || dst.data_type() != DataType::U8);
+ const Tensor<uint8_t> s1(src1.shape(), src1.data_type(), src1.fixed_point_position(), reinterpret_cast<const uint8_t *>(src1.data()));
+ const Tensor<uint8_t> s2(src2.shape(), src2.data_type(), src2.fixed_point_position(), reinterpret_cast<const uint8_t *>(src2.data()));
+ Tensor<uint8_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint8_t *>(dst.data()));
+ tensor_operations::bitwise_and(s1, s2, d);
+}
+
+// Bitwise or
+void ReferenceCPP::bitwise_or(const RawTensor &src1, const RawTensor &src2, RawTensor &dst)
+{
+ ARM_COMPUTE_ERROR_ON(src1.data_type() != DataType::U8 || src2.data_type() != DataType::U8 || dst.data_type() != DataType::U8);
+ const Tensor<uint8_t> s1(src1.shape(), src1.data_type(), src1.fixed_point_position(), reinterpret_cast<const uint8_t *>(src1.data()));
+ const Tensor<uint8_t> s2(src2.shape(), src2.data_type(), src2.fixed_point_position(), reinterpret_cast<const uint8_t *>(src2.data()));
+ Tensor<uint8_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint8_t *>(dst.data()));
+ tensor_operations::bitwise_or(s1, s2, d);
+}
+
+// Bitwise xor
+void ReferenceCPP::bitwise_xor(const RawTensor &src1, const RawTensor &src2, RawTensor &dst)
+{
+ ARM_COMPUTE_ERROR_ON(src1.data_type() != DataType::U8 || src2.data_type() != DataType::U8 || dst.data_type() != DataType::U8);
+ const Tensor<uint8_t> s1(src1.shape(), src1.data_type(), src1.fixed_point_position(), reinterpret_cast<const uint8_t *>(src1.data()));
+ const Tensor<uint8_t> s2(src2.shape(), src2.data_type(), src2.fixed_point_position(), reinterpret_cast<const uint8_t *>(src2.data()));
+ Tensor<uint8_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint8_t *>(dst.data()));
+ tensor_operations::bitwise_xor(s1, s2, d);
+}
+
+// Bitwise not
+void ReferenceCPP::bitwise_not(const RawTensor &src, RawTensor &dst)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_type() != DataType::U8 || dst.data_type() != DataType::U8);
+ const Tensor<uint8_t> s(src.shape(), src.data_type(), src.fixed_point_position(), reinterpret_cast<const uint8_t *>(src.data()));
+ Tensor<uint8_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint8_t *>(dst.data()));
+ tensor_operations::bitwise_not(s, d);
+}
+
+// 3-by-3 box filter
+void ReferenceCPP::box3x3(const RawTensor &src, RawTensor &dst)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_type() != DataType::U8 || dst.data_type() != DataType::U8);
+ const Tensor<uint8_t> s(src.shape(), src.data_type(), src.fixed_point_position(), reinterpret_cast<const uint8_t *>(src.data()));
+ Tensor<uint8_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint8_t *>(dst.data()));
+ tensor_operations::box3x3(s, d);
+}
+
+// Depth conversion
+void ReferenceCPP::depth_convert(const RawTensor &src, RawTensor &dst, ConvertPolicy policy, uint32_t shift)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::depth_convert_visitor(policy, shift), s, d);
+}
+
+// GEMM
+void ReferenceCPP::gemm(const RawTensor &src1, const RawTensor &src2, const RawTensor &src3,
+ RawTensor &dst, float alpha, float beta)
+{
+ const TensorVariant s1 = TensorFactory::get_tensor(src1);
+ const TensorVariant s2 = TensorFactory::get_tensor(src2);
+ const TensorVariant s3 = TensorFactory::get_tensor(src3);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+
+ boost::apply_visitor(tensor_visitors::gemm_visitor(s1, s2, s3, alpha, beta), d);
+}
+
+// Pixel-wise multiplication
+void ReferenceCPP::pixel_wise_multiplication(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+ const TensorVariant s1 = TensorFactory::get_tensor(src1);
+ const TensorVariant s2 = TensorFactory::get_tensor(src2);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(pixel_wise_multiplication_visitor(scale, convert_policy, rounding_policy), s1, s2, d);
+}
+
+// Fixed-point Pixel-wise multiplication
+void ReferenceCPP::fixed_point_pixel_wise_multiplication(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+ const TensorVariant s1 = TensorFactory::get_tensor(src1);
+ const TensorVariant s2 = TensorFactory::get_tensor(src2);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::fixed_point_pixel_wise_multiplication_visitor(s1, s2, scale, convert_policy, rounding_policy), d);
+}
+
+// Threshold
+void ReferenceCPP::threshold(const RawTensor &src, RawTensor &dst, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_type() != DataType::U8 || dst.data_type() != DataType::U8);
+ const Tensor<uint8_t> s(src.shape(), src.data_type(), src.fixed_point_position(), reinterpret_cast<const uint8_t *>(src.data()));
+ Tensor<uint8_t> d(dst.shape(), dst.data_type(), dst.fixed_point_position(), reinterpret_cast<uint8_t *>(dst.data()));
+ threshold_operation(s, d, threshold, false_value, true_value, type, upper);
+}
+
+// Activation layer
+void ReferenceCPP::activation_layer(const RawTensor &input, RawTensor &output, ActivationLayerInfo act_info)
+{
+ const TensorVariant s = TensorFactory::get_tensor(input);
+ TensorVariant d = TensorFactory::get_tensor(output);
+ boost::apply_visitor(tensor_visitors::activation_layer_visitor(s, act_info), d);
+}
+
+// Batch Normalization Layer
+void ReferenceCPP::batch_normalization_layer(const RawTensor &src, RawTensor &dst, const RawTensor &mean, const RawTensor &var, const RawTensor &beta, const RawTensor &gamma, float epsilon,
+ int fixed_point_position)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ const TensorVariant m = TensorFactory::get_tensor(mean);
+ const TensorVariant v = TensorFactory::get_tensor(var);
+ const TensorVariant b = TensorFactory::get_tensor(beta);
+ const TensorVariant g = TensorFactory::get_tensor(gamma);
+ boost::apply_visitor(tensor_visitors::batch_normalization_layer_visitor(s, m, v, b, g, epsilon, fixed_point_position), d);
+}
+
+// Convolution Layer
+void ReferenceCPP::convolution_layer(const RawTensor &src, const RawTensor &weights, const RawTensor &bias, RawTensor &dst, const PadStrideInfo &conv_info)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ const TensorVariant w = TensorFactory::get_tensor(weights);
+ const TensorVariant b = TensorFactory::get_tensor(bias);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::convolution_layer_visitor(s, w, b, conv_info), d);
+}
+
+// Fully connected layer
+void ReferenceCPP::fully_connected_layer(const RawTensor &src, const RawTensor &weights, const RawTensor &bias, RawTensor &dst)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ const TensorVariant w = TensorFactory::get_tensor(weights);
+ const TensorVariant b = TensorFactory::get_tensor(bias);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::fully_connected_layer_visitor(s, w, b), d);
+}
+
+// Normalization Layer
+void ReferenceCPP::normalization_layer(const RawTensor &src, RawTensor &dst, NormalizationLayerInfo norm_info)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::normalization_layer_visitor(s, norm_info), d);
+}
+
+// Pooling Layer
+void ReferenceCPP::pooling_layer(const RawTensor &src, RawTensor &dst, PoolingLayerInfo pool_info, int fixed_point_position)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::pooling_layer_visitor(s, pool_info, fixed_point_position), d);
+}
+
+// Softmax Layer
+void ReferenceCPP::softmax_layer(const RawTensor &src, RawTensor &dst)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::softmax_layer_visitor(s), d);
+}
+
+// Fixed point operation
+void ReferenceCPP::fixed_point_operation(const RawTensor &src, RawTensor &dst, FixedPointOp op)
+{
+ const TensorVariant s = TensorFactory::get_tensor(src);
+ TensorVariant d = TensorFactory::get_tensor(dst);
+ boost::apply_visitor(tensor_visitors::fixed_point_operation_visitor(s, op), d);
+}
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/ReferenceCPP.h b/tests/validation/ReferenceCPP.h
new file mode 100644
index 0000000000..be5a733896
--- /dev/null
+++ b/tests/validation/ReferenceCPP.h
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_REFERENCE_REFERENCE_CPP_H__
+#define __ARM_COMPUTE_TEST_REFERENCE_REFERENCE_CPP_H__
+
+#include "Reference.h"
+
+#include "RawTensor.h"
+
+#include <ostream>
+
+namespace arm_compute
+{
+class Tensor;
+
+namespace test
+{
+namespace validation
+{
+/** C++ reference implementation. */
+class ReferenceCPP final : public Reference
+{
+public:
+ /** Function to compute the integral image of a tensor.
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void integral_image(const RawTensor &src, RawTensor &dst);
+ /** Function to compute the absolute difference between two tensors.
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void absolute_difference(const RawTensor &src1, const RawTensor &src2, RawTensor &dst);
+ /** Function to accumulate an input tensor into an output tensor.
+ *
+ * @param[in] src Input tensor.
+ * @param[in, out] dst Result tensor.
+ */
+ static void accumulate(const RawTensor &src, RawTensor &dst);
+ /** Function to accumulate a squared value from an input tensor to an output tensor.
+ *
+ * @param[in] src Input tensor.
+ * @param[in, out] dst Result tensor.
+ * @param[in] shift A uint32_t value within the range of [0, 15].
+ */
+ static void accumulate_squared(const RawTensor &src, RawTensor &dst, uint32_t shift);
+ /** Function to accumulate a weighted value from an input tensor to an output tensor.
+ *
+ * @param[in] src Input tensor.
+ * @param[in, out] dst Result tensor.
+ * @param[in] alpha A float value within the range of [0, 1].
+ */
+ static void accumulate_weighted(const RawTensor &src, RawTensor &dst, float alpha);
+ /** Arithmetic addition of @p src1 and @p src2
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] convert_policy Overflow policy.
+ */
+ static void arithmetic_addition(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, ConvertPolicy convert_policy);
+ /** Arithmetic subtraction of @p src2 from @p src1
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] convert_policy Overflow policy.
+ */
+ static void arithmetic_subtraction(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, ConvertPolicy convert_policy);
+ /** Function to compute the bitwise and between two tensors.
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void bitwise_and(const RawTensor &src1, const RawTensor &src2, RawTensor &dst);
+ /** Function to compute the bitwise or between two tensors.
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void bitwise_or(const RawTensor &src1, const RawTensor &src2, RawTensor &dst);
+ /** Function to compute the bitwise xor between two tensors.
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void bitwise_xor(const RawTensor &src1, const RawTensor &src2, RawTensor &dst);
+ /** Function to compute the bitwise not of a tensor.
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void bitwise_not(const RawTensor &src, RawTensor &dst);
+ /** Function to compute 3-by-3 box filtered result tensor.
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void box3x3(const RawTensor &src, RawTensor &dst);
+ /** Depth conversion from @p src to @p dst
+ *
+ * @param[in] src First tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] policy Overflow policy.
+ * @param[in] shift Value for down/up conversions.
+ */
+ static void depth_convert(const RawTensor &src, RawTensor &dst, ConvertPolicy policy, uint32_t shift);
+ /** Compute GEMM function.
+ *
+ * @param[in] src1 First input tensor.
+ * @param[in] src2 Second input tensor.
+ * @param[in] src3 Third input tensor.
+ * @param[out] dst Output tensor.
+ * @param[in] alpha Weight of the matrix product.
+ * @param[in] beta Weight of the third matrix.
+ */
+ static void gemm(const RawTensor &src1, const RawTensor &src2, const RawTensor &src3,
+ RawTensor &dst, float alpha, float beta);
+ /** Element-wise multiplication of @p src1 and @p src2, scaled by @p scale
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] scale Non-negative scale factor applied to each product.
+ * @param[in] convert_policy Overflow policy.
+ * @param[in] rounding_policy Rounding policy.
+ */
+ static void pixel_wise_multiplication(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
+ /** Fixed-point pixel-wise multiplication of @p src1 by @p src2, scaled by @p scale
+ *
+ * @param[in] src1 First tensor.
+ * @param[in] src2 Second tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] scale Non-negative scale factor applied to each product.
+ * @param[in] convert_policy Overflow policy.
+ * @param[in] rounding_policy Rounding policy.
+ */
+ static void fixed_point_pixel_wise_multiplication(const RawTensor &src1, const RawTensor &src2, RawTensor &dst, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy);
+ /** Threshold of @p src to @p dst
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+ * @param[in] false_value Value to set when the condition is not met.
+ * @param[in] true_value Value to set when the condition is met.
+ * @param[in] type Thresholding type. Either RANGE or BINARY.
+ * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
+ */
+ static void threshold(const RawTensor &src, RawTensor &dst, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+    /** Activation layer of @p input based on the information from @p act_info.
+     *
+     * @param[in]  input    Input tensor.
+     * @param[out] output   Result tensor.
+     * @param[in]  act_info Activation layer information.
+ */
+ static void activation_layer(const RawTensor &input, RawTensor &output, ActivationLayerInfo act_info);
+    /** Batch Normalization of @p src based on the given mean, variance, beta and gamma tensors.
+     *
+     * @param[in]  src                  Input tensor.
+     * @param[out] dst                  Result tensor.
+     * @param[in]  mean                 Mean vector tensor.
+     * @param[in]  var                  Variance vector tensor.
+     * @param[in]  beta                 Beta vector tensor.
+     * @param[in]  gamma                Gamma vector tensor.
+     * @param[in]  epsilon              Small value to avoid division by zero.
+ * @param[in] fixed_point_position Fixed point position.
+ */
+ static void batch_normalization_layer(const RawTensor &src, RawTensor &dst, const RawTensor &mean, const RawTensor &var, const RawTensor &beta, const RawTensor &gamma, float epsilon,
+ int fixed_point_position = 0);
+ /** Convolution layer function
+ *
+ * @param[in] src Input tensor.
+ * @param[in] weights Weights tensor.
+ * @param[in] bias Bias tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] conv_info Pads and strides information for the convolution layer.
+ */
+ static void convolution_layer(const RawTensor &src, const RawTensor &weights, const RawTensor &bias, RawTensor &dst, const PadStrideInfo &conv_info);
+ /** Fully connected layer function
+ *
+ * @param[in] src Input tensor
+ * @param[in] weights Weights tensor.
+ * @param[in] bias Bias tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void fully_connected_layer(const RawTensor &src, const RawTensor &weights, const RawTensor &bias, RawTensor &dst);
+ /** Normalization of @p src based on the information from @p norm_info.
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] norm_info Normalization Layer information.
+ */
+ static void normalization_layer(const RawTensor &src, RawTensor &dst, NormalizationLayerInfo norm_info);
+    /** Pooling layer of @p src based on the information from @p pool_info.
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] pool_info Pooling Layer information.
+ * @param[in] fixed_point_position Fixed point position. (Optional)
+ */
+ static void pooling_layer(const RawTensor &src, RawTensor &dst, PoolingLayerInfo pool_info, int fixed_point_position = 0);
+ /** Softmax Layer of @p src.
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ */
+ static void softmax_layer(const RawTensor &src, RawTensor &dst);
+ /** Fixed point operations of @p src
+ *
+ * @param[in] src Input tensor.
+ * @param[out] dst Result tensor.
+ * @param[in] op Fixed point operation to perform.
+ */
+ static void fixed_point_operation(const RawTensor &src, RawTensor &dst, FixedPointOp op);
+
+private:
+ ReferenceCPP() = delete;
+ ~ReferenceCPP() = delete;
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/validation/Tensor.h b/tests/validation/Tensor.h
new file mode 100644
index 0000000000..81066b40ad
--- /dev/null
+++ b/tests/validation/Tensor.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TENSOR_H__
+#define __ARM_COMPUTE_TEST_TENSOR_H__
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename T>
+class Tensor
+{
+public:
+    Tensor()
+        : _shape(), _dt(DataType::UNKNOWN), _fixed_point_position(0), _ptr(nullptr), _ptr_const(nullptr) {}
+
+    Tensor(TensorShape shape, DataType dt, int fixed_point_position, T *ptr)
+        : _shape(shape), _dt(dt), _fixed_point_position(fixed_point_position), _ptr(ptr), _ptr_const(nullptr) {}
+
+    Tensor(TensorShape shape, DataType dt, int fixed_point_position, const T *ptr)
+        : _shape(shape), _dt(dt), _fixed_point_position(fixed_point_position), _ptr(nullptr), _ptr_const(ptr) {}
+
+ Tensor(const Tensor &tensor) = delete;
+ Tensor &operator=(const Tensor &) = delete;
+ Tensor(Tensor &&) = default;
+ Tensor &operator=(Tensor &&) = default;
+
+ ~Tensor() = default;
+
+ T &operator[](size_t offset)
+ {
+ return _ptr[offset];
+ }
+
+ const T &operator[](size_t offset) const
+ {
+ return _ptr_const[offset];
+ }
+
+ int num_elements() const
+ {
+ return std::accumulate(_shape.cbegin(), _shape.cend(), 1, std::multiplies<int>());
+ }
+
+ TensorShape shape() const
+ {
+ return _shape;
+ }
+
+ DataType data_type() const
+ {
+ return _dt;
+ }
+
+ int fixed_point_position() const
+ {
+ return _fixed_point_position;
+ }
+
+ const T *data() const
+ {
+ return (_ptr != nullptr) ? _ptr : _ptr_const;
+ }
+ T *data()
+ {
+ return _ptr;
+ }
+
+ const T *data_const()
+ {
+ return _ptr_const;
+ }
+
+private:
+ TensorShape _shape;
+ DataType _dt;
+ int _fixed_point_position;
+ T *_ptr;
+ const T *_ptr_const;
+};
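+
+// Illustrative usage (a sketch, not part of the patch itself; `buffer` and the shape are
+// hypothetical). Tensor<T> is a non-owning view over externally managed memory:
+//
+//   float buffer[8 * 4];
+//   Tensor<float> t(TensorShape(8U, 4U), DataType::F32, 0, buffer);
+//   t[0] = 1.f; // reads and writes go straight to `buffer`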
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_TEST_TENSOR_H__ */
diff --git a/tests/validation/TensorFactory.h b/tests/validation/TensorFactory.h
new file mode 100644
index 0000000000..48f9d6702f
--- /dev/null
+++ b/tests/validation/TensorFactory.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TENSOR_FACTORY_H__
+#define __ARM_COMPUTE_TEST_TENSOR_FACTORY_H__
+
+#include "RawTensor.h"
+#include "Tensor.h"
+#include "arm_compute/core/Error.h"
+
+#include "boost_wrapper.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using TensorVariant = boost::variant < Tensor<uint8_t>, Tensor<int8_t>,
+ Tensor<uint16_t>, Tensor<int16_t>,
+ Tensor<uint32_t>, Tensor<int32_t>,
+#ifdef ENABLE_FP16
+ Tensor<float16_t>,
+#endif
+ Tensor<float >>;
+
+/** Helper to create a constant type if the passed reference is constant. */
+template <typename R, typename T>
+struct match_const
+{
+ using type = typename std::conditional<std::is_const<typename std::remove_reference<R>::type>::value, const T, T>::type;
+};
+
+class TensorFactory
+{
+public:
+ template <typename R>
+ static TensorVariant get_tensor(R &&raw)
+ {
+ TensorVariant v;
+ DataType dt = raw.data_type();
+ int fixed_point_position = raw.fixed_point_position();
+ auto shape = raw.shape();
+ auto data = raw.data();
+
+ switch(dt)
+ {
+ case DataType::U8:
+ using value_type_u8 = typename match_const<R, uint8_t>::type;
+ v = Tensor<uint8_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_u8 *>(data));
+ break;
+ case DataType::S8:
+ case DataType::QS8:
+ using value_type_s8 = typename match_const<R, int8_t>::type;
+ v = Tensor<int8_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_s8 *>(data));
+ break;
+ case DataType::U16:
+ using value_type_u16 = typename match_const<R, uint16_t>::type;
+ v = Tensor<uint16_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_u16 *>(data));
+ break;
+ case DataType::S16:
+ using value_type_s16 = typename match_const<R, int16_t>::type;
+ v = Tensor<int16_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_s16 *>(data));
+ break;
+ case DataType::U32:
+ using value_type_u32 = typename match_const<R, uint32_t>::type;
+ v = Tensor<uint32_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_u32 *>(data));
+ break;
+ case DataType::S32:
+ using value_type_s32 = typename match_const<R, int32_t>::type;
+ v = Tensor<int32_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_s32 *>(data));
+ break;
+#ifdef ENABLE_FP16
+ case DataType::F16:
+ using value_type_f16 = typename match_const<R, float16_t>::type;
+                v = Tensor<float16_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_f16 *>(data));
+ break;
+#endif
+ case DataType::F32:
+ using value_type_f32 = typename match_const<R, float>::type;
+ v = Tensor<float>(shape, dt, fixed_point_position, reinterpret_cast<value_type_f32 *>(data));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+ return v;
+ }
+};
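+
+// Illustrative usage (a sketch; `raw` is a hypothetical RawTensor): the factory maps a
+// runtime DataType onto a statically typed Tensor<T> wrapped in a boost::variant, so a
+// boost::static_visitor can be applied afterwards:
+//
+//   TensorVariant v = TensorFactory::get_tensor(raw);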
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_TEST_TENSOR_FACTORY_H__ */
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
new file mode 100644
index 0000000000..5e27e9d3a0
--- /dev/null
+++ b/tests/validation/TensorOperations.h
@@ -0,0 +1,1370 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TENSOR_OPERATIONS_H__
+#define __ARM_COMPUTE_TEST_TENSOR_OPERATIONS_H__
+
+#include "FixedPoint.h"
+#include "Tensor.h"
+#include "Types.h"
+#include "Utils.h"
+
+#include "FixedPoint.h"
+#include "Types.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "tests/validation/FixedPoint.h"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace tensor_operations
+{
+namespace
+{
+bool is_valid_pixel(int i, int min, int max)
+{
+ return (i >= min && i < max);
+}
+
+// 3D convolution for floating point type
+template <typename T, typename std::enable_if<std::is_floating_point<T>::value, int>::type * = nullptr>
+void convolution3d(const T *in, const T *weights, const T *bias, T *out, int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights, int8_t fixed_point_position)
+{
+ const int half_width_weights = width_weights / 2;
+ const int half_height_weights = height_weights / 2;
+
+ // Reset accumulator
+ T acc = static_cast<T>(0);
+
+ // Compute a 2D convolution for each IFM and accumulate the result
+ for(int ifm = 0; ifm < depth_in; ++ifm)
+ {
+ // Compute the offset for the input slice
+ const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+ // Compute 2D convolution
+ for(int yk = -half_height_weights; yk <= half_height_weights; ++yk)
+ {
+ for(int xk = -half_width_weights; xk <= half_width_weights; ++xk)
+ {
+ // Check if the pixel is out-of-bound
+ if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
+ {
+ const int idx = xk + half_width_weights;
+ const int idy = yk + half_height_weights;
+
+ const T i_value = in[offset_slice_in + xk + yk * width_in];
+ const T w_value = weights[idx + idy * width_weights + ifm * width_weights * height_weights];
+
+ acc += i_value * w_value;
+ }
+ }
+ }
+ }
+
+ // Accumulate the bias and store the result
+ *out = acc + (*bias);
+}
+
+// 3D convolution for fixed point type
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
+void convolution3d(const T *in, const T *weights, const T *bias, T *out, int xi, int yi, int width_in, int height_in, int depth_in, int width_weights, int height_weights,
+ int8_t fixed_point_position)
+{
+ const int half_width_weights = width_weights / 2;
+ const int half_height_weights = height_weights / 2;
+
+ using namespace fixed_point_arithmetic;
+ using promoted_type = typename fixed_point_arithmetic::traits::promote<T>::type;
+
+ // Reset accumulator
+ fixed_point<promoted_type> acc(0, fixed_point_position);
+
+ // Compute a 2D convolution for each IFM and accumulate the result
+ for(int ifm = 0; ifm < depth_in; ++ifm)
+ {
+ // Compute the offset for the input slice
+ const int offset_slice_in = xi + yi * width_in + ifm * width_in * height_in;
+
+ // Compute 2D convolution
+ for(int yk = -half_height_weights; yk <= half_height_weights; ++yk)
+ {
+ for(int xk = -half_width_weights; xk <= half_width_weights; ++xk)
+ {
+ // Check if the pixel is out-of-bound
+ if(is_valid_pixel(xi + xk, 0, width_in) && is_valid_pixel(yi + yk, 0, height_in))
+ {
+ const int idx = xk + half_width_weights;
+ const int idy = yk + half_height_weights;
+
+ const fixed_point<promoted_type> i_value(in[offset_slice_in + xk + yk * width_in], fixed_point_position, true);
+ const fixed_point<promoted_type> w_value(weights[idx + idy * width_weights + ifm * width_weights * height_weights], fixed_point_position, true);
+ const fixed_point<promoted_type> iw = i_value * w_value;
+ acc = iw + acc;
+ }
+ }
+ }
+ }
+
+ // Get the bias
+ const fixed_point<promoted_type> b(*bias, fixed_point_position, true);
+
+    // Accumulate the bias and convert back
+ acc = acc + b;
+ fixed_point<T> res(acc);
+ *out = res.raw();
+}
+
+template <typename T>
+void vector_matrix_multiply(const T *in, const T *weights, const T *bias, T *out, int cols_weights, int rows_weights, uint8_t fixed_point_position)
+{
+ for(int x = 0; x < cols_weights; ++x)
+ {
+ T acc = 0.0f;
+ for(int y = 0; y < rows_weights; ++y)
+ {
+ acc += in[y] * weights[x + y * cols_weights];
+ }
+ out[x] = acc + bias[x];
+ }
+}
+
+template <>
+void vector_matrix_multiply(const int8_t *in, const int8_t *weights, const int8_t *bias, int8_t *out, int cols_weights, int rows_weights, uint8_t fixed_point_position)
+{
+ using namespace fixed_point_arithmetic;
+ using promoted_type = typename fixed_point_arithmetic::traits::promote<int8_t>::type;
+
+ for(int x = 0; x < cols_weights; ++x)
+ {
+ // Reset accumulator
+ fixed_point<promoted_type> acc(0, fixed_point_position);
+
+ for(int y = 0; y < rows_weights; ++y)
+ {
+ const fixed_point<promoted_type> i_value(in[y], fixed_point_position, true);
+ const fixed_point<promoted_type> w_value(weights[x + y * cols_weights], fixed_point_position, true);
+ const fixed_point<promoted_type> iw = i_value * w_value;
+ acc = iw + acc;
+ }
+
+ // Get the bias
+ const fixed_point<int8_t> b(bias[x], fixed_point_position, true);
+
+ // Convert back and accumulate the bias
+ fixed_point<int8_t> res(acc);
+ res = res + b;
+
+ // Store the result
+ out[x] = res.raw();
+ }
+}
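+
+// Note: the int8_t (QS8) specialization above accumulates in the promoted, wider fixed
+// point type to reduce the risk of intermediate overflow, and converts back to 8 bits
+// only once per output element.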
+
+/** Apply a 2D spatial filter to a single element of @p in at coordinates @p coord
+ *
+ * - Filter sizes have to be odd numbers
+ * - A valid region is assumed
+ * - Row-major order of the filter is assumed
+ * - TO_ZERO rounding policy is assumed
+ * - SATURATE convert policy is assumed
+ *
+ */
+template <typename T1, typename T2, typename T3>
+void apply_2d_spatial_filter(Coordinates coord, const Tensor<T1> &in, Tensor<T3> &out, const TensorShape &filter_shape, const T2 *filter_itr, float scale)
+{
+ using intermediate_type = typename common_promoted_signed_type<T1, T2, T3>::intermediate_type;
+ intermediate_type val = 0;
+ int x = coord.x();
+ int y = coord.y();
+    // Signed loop indices avoid unsigned wrap-around when the filter reaches the tensor borders
+    for(int j = y - static_cast<int>(filter_shape[1] / 2); j <= y + static_cast<int>(filter_shape[1] / 2); ++j)
+    {
+        for(int i = x - static_cast<int>(filter_shape[0] / 2); i <= x + static_cast<int>(filter_shape[0] / 2); ++i)
+ {
+ coord.set(0, i);
+ coord.set(1, j);
+ val += static_cast<intermediate_type>(*filter_itr) * static_cast<intermediate_type>(in[coord2index(in.shape(), coord)]);
+ ++filter_itr;
+ }
+ }
+ coord.set(0, x);
+ coord.set(1, y);
+ double rounded_val = cpp11::trunc(val * static_cast<double>(scale));
+ out[coord2index(in.shape(), coord)] = saturate_cast<T3>(rounded_val);
+}
+} // namespace
+
+// Integral Image
+inline void integral_image(const Tensor<uint8_t> &in, Tensor<uint32_t> &out)
+{
+ // Length of dimensions
+ const size_t width = in.shape().x();
+ const size_t height = in.shape().y();
+ const size_t depth = in.shape().z() * in.shape()[3] * in.shape()[4] * in.shape()[5];
+
+ const size_t image_size = width * height;
+
+ for(size_t z = 0; z < depth; ++z)
+ {
+ size_t current_image = z * image_size;
+
+ //First element of each image
+ out[current_image] = in[current_image];
+
+ // First row of each image (add only pixel on the left)
+ for(size_t x = 1; x < width; ++x)
+ {
+ out[current_image + x] = static_cast<uint32_t>(in[current_image + x]) + out[current_image + x - 1];
+ }
+
+ // Subsequent rows
+ for(size_t y = 1; y < height; ++y)
+ {
+ size_t current_row = current_image + (width * y);
+
+ // First element of each row (add only pixel up)
+ out[current_row] = static_cast<uint32_t>(in[current_row]) + out[current_row - width];
+
+ // Following row elements
+ for(size_t x = 1; x < width; ++x)
+ {
+ size_t current_pixel = current_row + x;
+
+ // out = in + up(out) + left(out) - up_left(out)
+ out[current_pixel] = static_cast<uint32_t>(in[current_pixel]) + out[current_pixel - 1]
+ + out[current_pixel - width] - out[current_pixel - width - 1];
+ }
+ }
+ }
+}
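+
+// Note: with the recurrence above, the sum of any axis-aligned rectangle of the input
+// with corners (x0, y0) and (x1, y1), x0, y0 > 0, follows from four lookups:
+//   sum = out[x1, y1] - out[x0 - 1, y1] - out[x1, y0 - 1] + out[x0 - 1, y0 - 1]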
+
+// Absolute difference
+template <typename T1, typename T2, typename T3>
+void absolute_difference(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out)
+{
+ using intermediate_type = typename common_promoted_signed_type<T1, T2, T3>::intermediate_type;
+
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ intermediate_type val = std::abs(static_cast<intermediate_type>(in1[i]) - static_cast<intermediate_type>(in2[i]));
+ out[i] = saturate_cast<T3>(val);
+ }
+}
+
+// Accumulate
+template <typename T1, typename T2>
+void accumulate(const Tensor<T1> &in, Tensor<T2> &out)
+{
+ using intermediate_type = typename common_promoted_signed_type<T1, T2>::intermediate_type;
+
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ intermediate_type val = static_cast<intermediate_type>(out[i]) + static_cast<intermediate_type>(in[i]);
+ out[i] = saturate_cast<T2>(val);
+ }
+}
+
+// Accumulate squared
+template <typename T1, typename T2>
+void accumulate_squared(const Tensor<T1> &in, Tensor<T2> &out, uint32_t shift)
+{
+ if(shift > 15)
+ {
+ ARM_COMPUTE_ERROR("Shift in accumulate_squared must be within the range [0, 15]");
+ }
+ using intermediate_type = typename common_promoted_signed_type<T1, T2>::intermediate_type;
+ intermediate_type denom = 1 << shift;
+
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ intermediate_type val = static_cast<intermediate_type>(out[i]) + (static_cast<intermediate_type>(in[i]) * static_cast<intermediate_type>(in[i]) / denom);
+ out[i] = saturate_cast<T2>(val);
+ }
+}
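+
+// Note: accumulate_squared computes out = out + (in * in) / 2^shift, i.e. the shift acts
+// as a power-of-two down-scale of the squared input before accumulation.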
+
+// Accumulate weighted
+template <typename T>
+void accumulate_weighted(const Tensor<T> &in, Tensor<T> &out, float alpha)
+{
+ if(alpha < 0.f || alpha > 1.f)
+ {
+ ARM_COMPUTE_ERROR("Weight (alpha) specified in accumulate_weighted must be within the range [0, 1]");
+ }
+ using intermediate_type = typename common_promoted_signed_type<T>::intermediate_type;
+
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ double val = (1. - static_cast<double>(alpha)) * static_cast<intermediate_type>(out[i]) + static_cast<double>(alpha) * static_cast<intermediate_type>(in[i]);
+ out[i] = static_cast<T>(val);
+ }
+}
+
+// Arithmetic addition
+template <typename T1, typename T2, typename T3>
+void arithmetic_addition(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out, ConvertPolicy convert_policy)
+{
+ using intermediate_type = typename common_promoted_signed_type<T1, T2, T3>::intermediate_type;
+
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ intermediate_type val = static_cast<intermediate_type>(in1[i]) + static_cast<intermediate_type>(in2[i]);
+ out[i] = (convert_policy == ConvertPolicy::SATURATE) ? saturate_cast<T3>(val) : static_cast<T3>(val);
+ }
+}
+
+// Arithmetic Subtraction
+template <typename T1, typename T2, typename T3>
+void arithmetic_subtraction(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out, ConvertPolicy convert_policy)
+{
+ using intermediate_type = typename common_promoted_signed_type<T1, T2, T3>::intermediate_type;
+
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ intermediate_type val = static_cast<intermediate_type>(in1[i]) - static_cast<intermediate_type>(in2[i]);
+ out[i] = (convert_policy == ConvertPolicy::SATURATE) ? saturate_cast<T3>(val) : static_cast<T3>(val);
+ }
+}
+
+// Bitwise and
+template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
+void bitwise_and(const Tensor<T> &in1, const Tensor<T> &in2, Tensor<T> &out)
+{
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ out[i] = in1[i] & in2[i];
+ }
+}
+
+// Bitwise or
+template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
+void bitwise_or(const Tensor<T> &in1, const Tensor<T> &in2, Tensor<T> &out)
+{
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ out[i] = in1[i] | in2[i];
+ }
+}
+
+// Bitwise xor
+template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
+void bitwise_xor(const Tensor<T> &in1, const Tensor<T> &in2, Tensor<T> &out)
+{
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ out[i] = in1[i] ^ in2[i];
+ }
+}
+
+// Bitwise not
+template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
+void bitwise_not(const Tensor<T> &in, Tensor<T> &out)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = ~in[i];
+ }
+}
+
+// 3-by-3 box filter
+template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
+void box3x3(const Tensor<T> &in, Tensor<T> &out)
+{
+ const std::array<T, 9> filter{ { 1, 1, 1, 1, 1, 1, 1, 1, 1 } };
+ float scale = 1.f / static_cast<float>(filter.size());
+ const ValidRegion valid_region = shape_to_valid_region_undefined_border(in.shape(), BorderSize(1));
+ for(int element_idx = 0; element_idx < in.num_elements(); ++element_idx)
+ {
+ const Coordinates id = index2coord(in.shape(), element_idx);
+ if(is_in_valid_region(valid_region, id))
+ {
+ apply_2d_spatial_filter(id, in, out, TensorShape(3U, 3U), filter.data(), scale);
+ }
+ }
+}
+
+// Depth conversion
+template <typename T1, typename T2>
+void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
+{
+ ARM_COMPUTE_ERROR("The conversion is not supported");
+}
+
+template <>
+void depth_convert<int8_t, float>(const Tensor<int8_t> &in, Tensor<float> &out, ConvertPolicy policy, uint32_t shift)
+{
+ const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position());
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = static_cast<float>(in[i]) * (1.0f / (1 << fixed_point_position));
+ }
+}
+
+template <>
+void depth_convert<float, int8_t>(const Tensor<float> &in, Tensor<int8_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position());
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ float val = in[i] * (1 << fixed_point_position) + 0.5f;
+ out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<int8_t>(val) : static_cast<int8_t>(val));
+ }
+}
+
+template <>
+void depth_convert<uint8_t, uint16_t>(const Tensor<uint8_t> &in, Tensor<uint16_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = static_cast<uint16_t>(in[i]) << shift;
+ }
+}
+
+template <>
+void depth_convert<uint8_t, int16_t>(const Tensor<uint8_t> &in, Tensor<int16_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = static_cast<int16_t>(in[i]) << shift;
+ }
+}
+
+template <>
+void depth_convert<uint8_t, int32_t>(const Tensor<uint8_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = static_cast<int32_t>(in[i]) << shift;
+ }
+}
+
+template <>
+void depth_convert<uint16_t, uint8_t>(const Tensor<uint16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ uint16_t val = in[i] >> shift;
+ out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val));
+ }
+}
+
+template <>
+void depth_convert<uint16_t, uint32_t>(const Tensor<uint16_t> &in, Tensor<uint32_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = static_cast<uint32_t>(in[i]) << shift;
+ }
+}
+
+template <>
+void depth_convert<int16_t, uint8_t>(const Tensor<int16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ int16_t val = in[i] >> shift;
+ out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val));
+ }
+}
+template <>
+void depth_convert<int16_t, int32_t>(const Tensor<int16_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift)
+{
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = static_cast<int32_t>(in[i]) << shift;
+ }
+}
+
+// Matrix multiplication for floating point type
+template <typename T, typename std::enable_if<std::is_floating_point<T>::value, int>::type * = nullptr>
+void gemm(const Tensor<T> &in1, const Tensor<T> &in2, const Tensor<T> &in3, Tensor<T> &out, float alpha, float beta)
+{
+ const int M = out.shape().y();
+ const int N = out.shape().x();
+ const int K = in1.shape().x();
+
+ for(int r = 0; r < M; ++r)
+ {
+ for(int c = 0; c < N; ++c)
+ {
+ T acc = 0.0f;
+
+ for(int k = 0; k < K; ++k)
+ {
+ const T a0 = in1[r * K + k];
+ const T b0 = in2[k * N + c];
+
+ acc += a0 * b0;
+ }
+
+ // Finalize the result: A * B * alpha + C * beta
+ const T c0 = in3[c + r * N];
+ out[c + r * N] = alpha * acc + beta * c0;
+ }
+ }
+}
+
+// Matrix multiplication for fixed point type
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
+void gemm(const Tensor<T> &in1, const Tensor<T> &in2, const Tensor<T> &in3, Tensor<T> &out, float alpha, float beta)
+{
+ using namespace fixed_point_arithmetic;
+
+ using promoted_type = typename fixed_point_arithmetic::traits::promote<T>::type;
+
+ const int M = out.shape().y();
+ const int N = out.shape().x();
+ const int K = in1.shape().x();
+ const int8_t fixed_point_position = static_cast<int8_t>(in1.fixed_point_position());
+
+ const fixed_point<T> alpha_q(alpha, fixed_point_position);
+ const fixed_point<T> beta_q(beta, fixed_point_position);
+
+ for(int r = 0; r < M; ++r)
+ {
+ for(int c = 0; c < N; ++c)
+ {
+ fixed_point<promoted_type> acc_q(0, fixed_point_position);
+
+ for(int k = 0; k < K; ++k)
+ {
+ const fixed_point<promoted_type> a0_q(in1[r * K + k], fixed_point_position, true);
+ const fixed_point<promoted_type> b0_q(in2[k * N + c], fixed_point_position, true);
+ const fixed_point<promoted_type> axb_q = a0_q * b0_q;
+
+ acc_q = axb_q + acc_q;
+ }
+
+ // Finalize the result: A * B * alpha + C * beta
+ const fixed_point<T> c0_q(in3[c + r * N], fixed_point_position, true);
+
+ fixed_point<T> res_q(acc_q);
+ res_q = alpha_q * res_q;
+ res_q = (c0_q * beta_q) + res_q;
+
+ // Store the result
+ out[c + r * N] = res_q.raw();
+ }
+ }
+}
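+
+// Note: both gemm overloads above compute out = alpha * (in1 * in2) + beta * in3, where
+// in1 is M rows by K columns, in2 is K rows by N columns and in3/out are M rows by N columns.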
+
+// Pixel-wise multiplication
+template <typename T1, typename T2, typename T3>
+void pixel_wise_multiplication(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+ if(scale < 0)
+ {
+ ARM_COMPUTE_ERROR("Scale of pixel-wise multiplication must be non-negative");
+ }
+ using intermediate_type = typename common_promoted_signed_type<T1, T2, T3>::intermediate_type;
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ double val = static_cast<intermediate_type>(in1[i]) * static_cast<intermediate_type>(in2[i]) * static_cast<double>(scale);
+ if(std::is_floating_point<T3>::value)
+ {
+ out[i] = val;
+ }
+ else
+ {
+ double rounded_val = 0;
+ switch(rounding_policy)
+ {
+ case(RoundingPolicy::TO_ZERO):
+ rounded_val = cpp11::trunc(val);
+ break;
+ case(RoundingPolicy::TO_NEAREST_UP):
+ rounded_val = cpp11::round_half_up(val);
+ break;
+ case(RoundingPolicy::TO_NEAREST_EVEN):
+ rounded_val = cpp11::round_half_even(val);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported rounding policy");
+ }
+ out[i] = (convert_policy == ConvertPolicy::SATURATE) ? saturate_cast<T3>(rounded_val) : static_cast<T3>(rounded_val);
+ }
+ }
+}
+
+// Fixed-point Pixel-wise Multiplication
+template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
+void fixed_point_pixel_wise_multiplication(const Tensor<T> &in1, const Tensor<T> &in2, Tensor<T> &out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+{
+ using namespace fixed_point_arithmetic;
+
+ const int fixed_point_position = in1.fixed_point_position();
+
+ ARM_COMPUTE_ERROR_ON_MSG(in1.data_type() != in2.data_type() || in1.data_type() != out.data_type(),
+ "Tensors must all have the same DataType");
+ ARM_COMPUTE_ERROR_ON_MSG(fixed_point_position != in2.fixed_point_position() || fixed_point_position != out.fixed_point_position(),
+ "Fixed-point position must be the same for both inputs and outputs");
+
+ // Validate fixed_point_position
+ ARM_COMPUTE_ERROR_ON((in1.data_type() == DataType::QS8) && (fixed_point_position == 0 || fixed_point_position > 7));
+ ARM_COMPUTE_ERROR_ON((in1.data_type() == DataType::QS16) && (fixed_point_position == 0 || fixed_point_position > 15));
+
+ fixed_point<T> fp_scale(scale, fixed_point_position);
+ const bool is_sat = convert_policy == ConvertPolicy::SATURATE;
+ const bool do_scaling = scale != 1;
+
+ for(int i = 0; i < in1.num_elements(); ++i)
+ {
+ fixed_point<T> val1(in1[i], fixed_point_position, true);
+ fixed_point<T> val2(in2[i], fixed_point_position, true);
+ fixed_point<T> res = (is_sat) ? val1 * val2 : mul<OverflowPolicy::WRAP>(val1, val2);
+ if(do_scaling)
+ {
+ res = (is_sat) ? res * fp_scale : mul<OverflowPolicy::WRAP>(res, fp_scale);
+ }
+ out[i] = res.raw();
+ }
+}
+
+// Threshold
+template <typename T>
+void threshold(const Tensor<T> &in, Tensor<T> &out, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ switch(type)
+ {
+ case ThresholdType::BINARY:
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = ((in[i] > threshold) ? true_value : false_value);
+ }
+ break;
+ case ThresholdType::RANGE:
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ if(in[i] > upper)
+ {
+ out[i] = false_value;
+ }
+ else if(in[i] < threshold)
+ {
+ out[i] = false_value;
+ }
+ else
+ {
+ out[i] = true_value;
+ }
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Thresholding type not recognised");
+ break;
+ }
+}
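+
+// Worked example (illustrative): with threshold = 10, upper = 20, true_value = 255 and
+// false_value = 0, BINARY maps inputs 11..255 to 255 and everything else to 0, while
+// RANGE maps inputs 10..20 to 255 and everything outside that band to 0.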
+
+// Activation Layer for floating point type
+template <typename T, typename std::enable_if<std::is_floating_point<T>::value, int>::type * = nullptr>
+void activation_layer(const Tensor<T> &in, Tensor<T> &out, ActivationLayerInfo act_info)
+{
+ const T a = static_cast<T>(act_info.a());
+ const T b = static_cast<T>(act_info.b());
+
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ T x = in[i];
+ switch(act_info.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ out[i] = std::abs(x);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ out[i] = std::min<T>(a, std::max<T>(0, x));
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ out[i] = a * x + b;
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ out[i] = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ out[i] = std::max<T>(0, x);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ out[i] = std::log(static_cast<T>(1) + std::exp(x));
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ out[i] = std::sqrt(x);
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ out[i] = x * x;
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ out[i] = a * std::tanh(b * x);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Activation function not recognised");
+ break;
+ }
+ }
+}
+
+// Activation Layer for fixed point type
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
+void activation_layer(const Tensor<T> &in, Tensor<T> &out, ActivationLayerInfo act_info)
+{
+ using namespace fixed_point_arithmetic;
+ int fixed_point_position = in.fixed_point_position();
+ ActivationLayerInfo::ActivationFunction act_func = act_info.activation();
+ const fixed_point<T> a(act_info.a(), fixed_point_position);
+ const fixed_point<T> b(act_info.b(), fixed_point_position);
+ const fixed_point<T> const_0(0, fixed_point_position);
+ const fixed_point<T> const_1(1, fixed_point_position);
+
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ fixed_point<T> x(in[i], fixed_point_position, true);
+ switch(act_func)
+ {
+ case ActivationLayerInfo::ActivationFunction::ABS:
+ out[i] = abs(x).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ out[i] = min(a, max(const_0, x)).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::LINEAR:
+ out[i] = add(b, mul(a, x)).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ out[i] = (const_1 / (const_1 + exp(-x))).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ out[i] = max(const_0, x).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+ out[i] = log(const_1 + exp(x)).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQRT:
+ out[i] = (const_1 / inv_sqrt(x)).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::SQUARE:
+ out[i] = mul(x, x).raw();
+ break;
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ out[i] = tanh(x).raw();
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Activation function not recognised");
+ break;
+ }
+ }
+}
+
+// Batch Normalization Layer for fixed point type
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
+void batch_normalization_layer(const Tensor<T> &in, Tensor<T> &out, const Tensor<T> &mean, const Tensor<T> &var, const Tensor<T> &beta, const Tensor<T> &gamma, float epsilon, int fixed_point_position)
+{
+ const int cols = static_cast<int>(in.shape()[0]);
+ const int rows = static_cast<int>(in.shape()[1]);
+ const int depth = static_cast<int>(in.shape()[2]);
+ int upper_dims = in.shape().total_size() / (cols * rows * depth);
+
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < depth; ++i)
+ {
+ for(int k = 0; k < rows; ++k)
+ {
+ for(int l = 0; l < cols; ++l)
+ {
+ const int pos = l + k * cols + i * rows * cols + r * cols * rows * depth;
+ fixed_point_arithmetic::fixed_point<T> in_qs8(in[pos], fixed_point_position, true);
+ fixed_point_arithmetic::fixed_point<T> var_qs8(var[i], fixed_point_position, true);
+ fixed_point_arithmetic::fixed_point<T> mean_qs8(mean[i], fixed_point_position, true);
+ fixed_point_arithmetic::fixed_point<T> beta_qs8(beta[i], fixed_point_position, true);
+ fixed_point_arithmetic::fixed_point<T> gamma_qs8(gamma[i], fixed_point_position, true);
+ fixed_point_arithmetic::fixed_point<T> epsilon_qs8(epsilon, fixed_point_position);
+
+ auto denominator = fixed_point_arithmetic::inv_sqrt(var_qs8 + epsilon_qs8);
+ auto numerator = in_qs8 - mean_qs8;
+ auto x_bar = numerator * denominator;
+ x_bar = beta_qs8 + x_bar * gamma_qs8;
+ out[pos] = x_bar.raw();
+ }
+ }
+ }
+ }
+}
+
+// Batch Normalization Layer for floating point type
+template <typename T, typename std::enable_if<std::is_floating_point<T>::value, int>::type * = nullptr>
+void batch_normalization_layer(const Tensor<T> &in, Tensor<T> &out, const Tensor<T> &mean, const Tensor<T> &var, const Tensor<T> &beta, const Tensor<T> &gamma, float epsilon, int fixed_point_position)
+{
+ const int cols = static_cast<int>(in.shape()[0]);
+ const int rows = static_cast<int>(in.shape()[1]);
+ const int depth = static_cast<int>(in.shape()[2]);
+ int upper_dims = in.shape().total_size() / (cols * rows * depth);
+
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < depth; ++i)
+ {
+ for(int k = 0; k < rows; ++k)
+ {
+ for(int l = 0; l < cols; ++l)
+ {
+ const int pos = l + k * cols + i * rows * cols + r * cols * rows * depth;
+ const float denominator = sqrt(var[i] + epsilon);
+ const float numerator = in[pos] - mean[i];
+ const float x_bar = numerator / denominator;
+ out[pos] = beta[i] + x_bar * gamma[i];
+ }
+ }
+ }
+ }
+}
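+
+// Note: both variants above implement inference-time batch normalization,
+//   out = gamma * (in - mean) / sqrt(var + epsilon) + beta,
+// applied per channel (the third tensor dimension).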
+
+// Convolution layer
+template <typename T>
+void convolution_layer(const Tensor<T> &in, const Tensor<T> &weights, const Tensor<T> &bias, Tensor<T> &out, const PadStrideInfo &conv_info)
+{
+ const int width_in = in.shape().x();
+ const int height_in = in.shape().y();
+ const int depth_in = in.shape().z();
+ const int width_out = out.shape().x();
+ const int height_out = out.shape().y();
+ const int depth_out = out.shape().z();
+ const int width_weights = weights.shape().x();
+ const int height_weights = weights.shape().y();
+ const int depth_weights = weights.shape().z();
+ const int pad_xi = std::min(static_cast<int>(conv_info.pad().first), width_weights / 2);
+ const int pad_yi = std::min(static_cast<int>(conv_info.pad().second), height_weights / 2);
+ const int start_xi = width_weights / 2 - pad_xi;
+ const int start_yi = height_weights / 2 - pad_yi;
+ const int end_xi = width_in - start_xi;
+ const int end_yi = height_in - start_yi;
+ const int stride_xi = conv_info.stride().first;
+ const int stride_yi = conv_info.stride().second;
+ const int num_batches = in.shape().total_size() / (width_in * height_in * depth_in);
+
+ for(int r = 0; r < num_batches; ++r)
+ {
+ for(int yi = start_yi; yi < end_yi; yi += stride_yi)
+ {
+ for(int xi = start_xi; xi < end_xi; xi += stride_xi)
+ {
+ for(int ofm = 0; ofm < depth_out; ++ofm)
+ {
+ // Compute input and output offsets
+ const int offset_in = r * width_in * height_in * depth_in;
+ const int xo = (xi - start_xi) / stride_xi;
+ const int yo = (yi - start_yi) / stride_yi;
+ const int offset_out = xo + yo * width_out + ofm * width_out * height_out + r * width_out * height_out * depth_out;
+
+ // Compute 3D convolution
+ convolution3d(in.data() + offset_in,
+ weights.data() + ofm * width_weights * height_weights * depth_weights,
+ bias.data() + ofm,
+ out.data() + offset_out,
+ xi, yi,
+ width_in, height_in, depth_in,
+ width_weights, height_weights,
+ static_cast<int8_t>(in.fixed_point_position()));
+ }
+ }
+ }
+ }
+}
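+
+// Illustrative call (a sketch; the tensors are hypothetical and must be pre-allocated
+// with matching shapes), using a 2x2 stride and no padding:
+//
+//   PadStrideInfo conv_info(2 /* stride_x */, 2 /* stride_y */, 0 /* pad_x */, 0 /* pad_y */);
+//   convolution_layer(src, weights, bias, dst, conv_info);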
+
+// Fully connected layer
+template <typename T>
+void fully_connected_layer(const Tensor<T> &in, const Tensor<T> &weights, const Tensor<T> &bias, Tensor<T> &out)
+{
+ ARM_COMPUTE_ERROR_ON(weights.shape().x() != out.shape().x());
+ ARM_COMPUTE_ERROR_ON(weights.shape().y() != in.shape().x() * in.shape().y() * in.shape().z());
+ const int cols_weights = weights.shape().x();
+ const int rows_weights = weights.shape().y();
+ const int num_batches = in.shape().total_size() / rows_weights;
+
+ for(int k = 0; k < num_batches; ++k)
+ {
+ vector_matrix_multiply<T>(in.data() + k * rows_weights,
+ weights.data(),
+ bias.data(),
+ out.data() + k * cols_weights,
+ cols_weights,
+ rows_weights,
+ in.fixed_point_position());
+ }
+}
+
+// Normalization Layer for floating point type
+template <typename T, typename std::enable_if<std::is_floating_point<T>::value, int>::type * = nullptr>
+void normalization_layer(const Tensor<T> &in, Tensor<T> &out, NormalizationLayerInfo norm_info)
+{
+ const uint32_t norm_size = norm_info.norm_size();
+ NormType type = norm_info.type();
+ float beta = norm_info.beta();
+    float          kappa     = norm_info.kappa();
+
+ const int cols = static_cast<int>(in.shape()[0]);
+ const int rows = static_cast<int>(in.shape()[1]);
+ const int depth = static_cast<int>(in.shape()[2]);
+ int upper_dims = in.shape().total_size() / (cols * rows);
+
+ float coeff = norm_info.scale_coeff();
+ int radius_cols = norm_size / 2;
+ // IN_MAP_1D and CROSS_MAP normalize over a single axis only
+ int radius_rows = (NormType::IN_MAP_2D == type) ? norm_size / 2 : 0;
+
+ if(type == NormType::CROSS_MAP)
+ {
+        // Also remove depth from the upper dimensions since it is the axis we
+        // want to use for normalization
+ upper_dims /= depth;
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < rows; ++i)
+ {
+ for(int k = 0; k < cols; ++k)
+ {
+ for(int l = 0; l < depth; ++l)
+ {
+ float accumulated_scale = 0.f;
+ for(int j = -radius_cols; j <= radius_cols; ++j)
+ {
+ const int z = l + j;
+ if(z >= 0 && z < depth)
+ {
+ const T value = in[k + i * cols + z * rows * cols + r * cols * rows * depth];
+ accumulated_scale += value * value;
+ }
+ }
+ out[k + i * cols + l * rows * cols + r * cols * rows * depth] = kappa + accumulated_scale * coeff;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < rows; ++i)
+ {
+ for(int k = 0; k < cols; ++k)
+ {
+ float accumulated_scale = 0.f;
+ for(int j = -radius_rows; j <= radius_rows; ++j)
+ {
+ const int y = i + j;
+ for(int l = -radius_cols; l <= radius_cols; ++l)
+ {
+ const int x = k + l;
+ if((x >= 0 && y >= 0) && (x < cols && y < rows))
+ {
+ const T value = in[x + y * cols + r * cols * rows];
+ accumulated_scale += value * value;
+ }
+ }
+ }
+ out[k + i * cols + r * cols * rows] = kappa + accumulated_scale * coeff;
+ }
+ }
+ }
+ }
+
+ if(beta == 1.f)
+ {
+ for(int i = 0; i < out.num_elements(); ++i)
+ {
+ out[i] = in[i] / out[i];
+ }
+ }
+ else if(beta == 0.5f)
+ {
+ for(int i = 0; i < out.num_elements(); ++i)
+ {
+ out[i] = in[i] / std::sqrt(out[i]);
+ }
+ }
+ else
+ {
+ for(int i = 0; i < out.num_elements(); ++i)
+ {
+ out[i] = in[i] * std::exp(std::log(out[i]) * -beta);
+ }
+ }
+}
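+
+// Note: the two passes above implement out = in / (kappa + coeff * sum_of_squares)^beta,
+// with the beta == 1 and beta == 0.5 cases special-cased to avoid the generic pow path.
+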
+// Normalization Layer for fixed-point types
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
+void normalization_layer(const Tensor<T> &in, Tensor<T> &out, NormalizationLayerInfo norm_info)
+{
+ using namespace fixed_point_arithmetic;
+
+ const int fixed_point_position = in.fixed_point_position();
+
+ const uint32_t norm_size = norm_info.norm_size();
+ NormType type = norm_info.type();
+ fixed_point<T> beta(norm_info.beta(), fixed_point_position);
+ fixed_point<T> kappa(norm_info.kappa(), fixed_point_position);
+
+ const int cols = static_cast<int>(in.shape()[0]);
+ const int rows = static_cast<int>(in.shape()[1]);
+ const int depth = static_cast<int>(in.shape()[2]);
+ int upper_dims = in.shape().total_size() / (cols * rows);
+
+ fixed_point<T> coeff(norm_info.scale_coeff(), fixed_point_position);
+ int radius_cols = norm_size / 2;
+ // IN_MAP_1D and CROSS_MAP normalize over a single axis only
+ int radius_rows = (NormType::IN_MAP_2D == type) ? norm_size / 2 : 0;
+
+ if(type == NormType::CROSS_MAP)
+ {
+        // Also remove depth from the upper dimensions since it is the axis we
+        // want to use for normalization
+ upper_dims /= depth;
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < rows; ++i)
+ {
+ for(int k = 0; k < cols; ++k)
+ {
+ for(int l = 0; l < depth; ++l)
+ {
+ fixed_point<T> accumulated_scale(0.f, fixed_point_position);
+ for(int j = -radius_cols; j <= radius_cols; ++j)
+ {
+ const int z = l + j;
+ if(z >= 0 && z < depth)
+ {
+ const T value = in[k + i * cols + z * rows * cols + r * cols * rows * depth];
+ const fixed_point<T> fp_value(value, fixed_point_position, true);
+ accumulated_scale = add(accumulated_scale, mul(fp_value, fp_value));
+ }
+ }
+ accumulated_scale = add(kappa, mul(accumulated_scale, coeff));
+ out[k + i * cols + l * rows * cols + r * cols * rows * depth] = accumulated_scale.raw();
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < rows; ++i)
+ {
+ for(int k = 0; k < cols; ++k)
+ {
+ fixed_point<T> accumulated_scale(0.f, fixed_point_position);
+ for(int j = -radius_rows; j <= radius_rows; ++j)
+ {
+ const int y = i + j;
+ for(int l = -radius_cols; l <= radius_cols; ++l)
+ {
+ const int x = k + l;
+ if((x >= 0 && y >= 0) && (x < cols && y < rows))
+ {
+ const T value = in[x + y * cols + r * cols * rows];
+ const fixed_point<T> fp_value(value, fixed_point_position, true);
+ accumulated_scale = add(accumulated_scale, mul(fp_value, fp_value));
+ }
+ }
+ }
+ accumulated_scale = add(kappa, mul(accumulated_scale, coeff));
+ out[k + i * cols + r * cols * rows] = accumulated_scale.raw();
+ }
+ }
+ }
+ }
+
+ if(norm_info.beta() == 1.f)
+ {
+ for(int i = 0; i < out.num_elements(); ++i)
+ {
+ fixed_point<T> res = div(fixed_point<T>(in[i], fixed_point_position, true), fixed_point<T>(out[i], fixed_point_position, true));
+ out[i] = res.raw();
+ }
+ }
+ else
+ {
+ for(int i = 0; i < out.num_elements(); ++i)
+ {
+ fixed_point<T> res = pow(fixed_point<T>(out[i], fixed_point_position, true), beta);
+ res = div(fixed_point<T>(in[i], fixed_point_position, true), res);
+ out[i] = res.raw();
+ }
+ }
+}
+
+// Pooling layer
+template <typename T>
+void pooling_layer(const Tensor<T> &in, Tensor<T> &out, PoolingLayerInfo pool_info, int fixed_point_position)
+{
+ const int pool_size = pool_info.pool_size();
+ PoolingType type = pool_info.pool_type();
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ int pad_x = 0;
+ int pad_y = 0;
+ std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info().stride();
+ std::tie(pad_x, pad_y) = pool_info.pad_stride_info().pad();
+
+ const int cols_in = static_cast<int>(in.shape()[0]);
+ const int rows_in = static_cast<int>(in.shape()[1]);
+
+ const int cols_out = static_cast<int>(out.shape()[0]);
+ const int rows_out = static_cast<int>(out.shape()[1]);
+
+ int upper_dims = in.shape().total_size() / (cols_in * rows_in);
+
+ int pooled_height = static_cast<int>(ceil(static_cast<float>(rows_in + 2 * pad_x - pool_size) / pool_stride_x)) + 1;
+ int pooled_width = static_cast<int>(ceil(static_cast<float>(cols_in + 2 * pad_y - pool_size) / pool_stride_y)) + 1;
+
+ if((pooled_height - 1) * pool_stride_x >= rows_in + pad_x)
+ {
+ --pooled_height;
+ }
+ if((pooled_width - 1) * pool_stride_y >= cols_in + pad_y)
+ {
+ --pooled_width;
+ }
+
+ if(type == PoolingType::MAX)
+ {
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < pooled_height; ++i)
+ {
+ for(int k = 0; k < pooled_width; ++k)
+ {
+ int hstart = i * pool_stride_x - pad_x;
+ int wstart = k * pool_stride_y - pad_y;
+ int hend = std::min(hstart + pool_size, rows_in);
+ int wend = std::min(wstart + pool_size, cols_in);
+ hstart = std::max(hstart, 0);
+ wstart = std::max(wstart, 0);
+
+ T max_val = std::numeric_limits<T>::lowest();
+ for(int y = hstart; y < hend; ++y)
+ {
+ for(int x = wstart; x < wend; ++x)
+ {
+ T val = in[r * cols_in * rows_in + y * cols_in + x];
+ if(val > max_val)
+ {
+ max_val = val;
+ }
+ }
+ }
+
+ out[r * rows_out * cols_out + i * pooled_width + k] = max_val;
+ }
+ }
+ }
+ }
+ else // Average pooling
+ {
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ for(int i = 0; i < pooled_height; ++i)
+ {
+ for(int k = 0; k < pooled_width; ++k)
+ {
+ T avg_val = 0;
+
+ int hstart = i * pool_stride_x - pad_x;
+ int wstart = k * pool_stride_y - pad_y;
+ int hend = std::min(hstart + pool_size, cols_in + pad_x);
+ int wend = std::min(wstart + pool_size, rows_in + pad_y);
+ int pool = (hend - hstart) * (wend - wstart);
+ hstart = std::max(hstart, 0);
+ wstart = std::max(wstart, 0);
+ hend = std::min(hend, rows_in);
+ wend = std::min(wend, cols_in);
+
+ if(std::is_floating_point<T>::value)
+ {
+ for(int y = hstart; y < hend; ++y)
+ {
+ for(int x = wstart; x < wend; ++x)
+ {
+ avg_val += in[r * cols_in * rows_in + y * cols_in + x];
+ }
+ }
+ out[r * rows_out * cols_out + i * pooled_width + k] = avg_val / pool;
+ }
+ else
+ {
+ static std::array<qint8_t, 10> scale_values_q8 =
+ { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
+
+ for(int y = hstart; y < hend; ++y)
+ {
+ for(int x = wstart; x < wend; ++x)
+ {
+ avg_val = sqadd_qs8(avg_val, in[r * cols_in * rows_in + y * cols_in + x]);
+ }
+ }
+ out[r * rows_out * cols_out + i * pooled_width + k] = sqmul_qs8(avg_val, (scale_values_q8[pool] >> (7 - fixed_point_position)), fixed_point_position);
+ }
+ }
+ }
+ }
+ }
+}
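+
+// Note: before the clamping above, each pooled output extent follows
+//   pooled = ceil((in_extent + 2 * pad - pool_size) / stride) + 1,
+// and the clamp drops the last position if its pool would start beyond the input.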
+
+// Softmax Layer
+template <typename T, typename std::enable_if<std::is_floating_point<T>::value, int>::type * = nullptr>
+void softmax_layer(const Tensor<T> &in, Tensor<T> &out)
+{
+ const int cols = static_cast<int>(in.shape()[0]);
+ const int upper_dims = in.shape().total_size() / cols;
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ // Find max
+ T max = std::numeric_limits<T>::lowest();
+ for(int c = 0; c < cols; ++c)
+ {
+ const T x = in[r * cols + c];
+ if(x > max)
+ {
+ max = x;
+ }
+ }
+
+ // Regularize
+ T sum = 0;
+ for(int c = 0; c < cols; ++c)
+ {
+ const T res = exp(in[r * cols + c] - max);
+ out[r * cols + c] = res;
+ sum += res;
+ }
+
+ // Normalize
+ const T norm_val = 1 / sum;
+ for(int c = 0; c < cols; ++c)
+ {
+ out[r * cols + c] *= norm_val;
+ }
+ }
+}
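+
+// Note: subtracting the row maximum before exponentiation (the "Regularize" step above)
+// leaves the result unchanged, since exp(x - max) / sum(exp(x - max)) == exp(x) / sum(exp(x)),
+// but keeps the exponentials in a numerically safe range.
+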
+template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type * = nullptr>
+void softmax_layer(const Tensor<T> &in, Tensor<T> &out)
+{
+ using namespace fixed_point_arithmetic;
+ using promoted_T = typename test::traits::promote<T>::type;
+
+ const int fixed_point_position = in.fixed_point_position();
+ const int cols = static_cast<int>(in.shape()[0]);
+ const int upper_dims = in.shape().total_size() / cols;
+
+ for(int r = 0; r < upper_dims; ++r)
+ {
+ // Find max
+ fixed_point<T> max(std::numeric_limits<T>::lowest(), fixed_point_position, true);
+ for(int c = 0; c < cols; ++c)
+ {
+ const fixed_point<T> x(in[r * cols + c], fixed_point_position, true);
+ if(x > max)
+ {
+ max = x;
+ }
+ }
+
+ // Regularize
+ fixed_point<promoted_T> sum(0, fixed_point_position);
+ for(int c = 0; c < cols; ++c)
+ {
+ const fixed_point<T> x(in[r * cols + c], fixed_point_position, true);
+ fixed_point<T> res = exp(x - max);
+ out[r * cols + c] = res.raw();
+ sum = add(sum, static_cast<fixed_point<promoted_T>>(res));
+ }
+
+ // Normalize
+ fixed_point<T> sat_sum(sum);
+ for(int c = 0; c < cols; ++c)
+ {
+ const fixed_point<T> x(out[r * cols + c], fixed_point_position, true);
+ out[r * cols + c] = div(x, sat_sum).raw();
+ }
+ }
+}
+
+// Fixed point operations
+template <typename T>
+void fixed_point_operation(const Tensor<T> &in, Tensor<T> &out, FixedPointOp op)
+{
+ int p = in.fixed_point_position();
+ switch(op)
+ {
+ case FixedPointOp::EXP:
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = fixed_point_arithmetic::exp(fixed_point_arithmetic::fixed_point<T>(in[i], p, true)).raw();
+ }
+ break;
+ case FixedPointOp::LOG:
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = fixed_point_arithmetic::log(fixed_point_arithmetic::fixed_point<T>(in[i], p, true)).raw();
+ }
+ break;
+ case FixedPointOp::INV_SQRT:
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = fixed_point_arithmetic::inv_sqrt(fixed_point_arithmetic::fixed_point<T>(in[i], p, true)).raw();
+ }
+ break;
+ case FixedPointOp::RECIPROCAL:
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = fixed_point_arithmetic::div(fixed_point_arithmetic::fixed_point<T>(1, p), fixed_point_arithmetic::fixed_point<T>(in[i], p, true)).raw();
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Fixed point operation not supported");
+ break;
+ }
+}
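+
+// Illustrative call (a sketch; `src` and `dst` are hypothetical QS8 tensors sharing the
+// same fixed point position):
+//
+//   fixed_point_operation(src, dst, FixedPointOp::EXP); // element-wise exp in fixed point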
+
+// Tensor print
+template <typename T>
+void print(const Tensor<T> &in, std::ostream &out)
+{
+ out << "\n";
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out << in[i] << " ";
+ }
+ out << "\n";
+}
+} // namespace tensor_operations
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_TEST_TENSOR_OPERATIONS_H__ */
diff --git a/tests/validation/TensorVisitors.h b/tests/validation/TensorVisitors.h
new file mode 100644
index 0000000000..a274140734
--- /dev/null
+++ b/tests/validation/TensorVisitors.h
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_TENSOR_VISITORS_H__
+#define __ARM_COMPUTE_TEST_TENSOR_VISITORS_H__
+
+#include "Tensor.h"
+#include "TensorOperations.h"
+#include "arm_compute/core/Error.h"
+
+#include "boost_wrapper.h"
+
+#include <ostream>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace tensor_visitors
+{
+// Absolute Difference visitor
+struct absolute_difference_visitor : public boost::static_visitor<>
+{
+public:
+ template <typename T1, typename T2, typename T3>
+ void operator()(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out) const
+ {
+ tensor_operations::absolute_difference(in1, in2, out);
+ }
+};
+// Arithmetic Addition visitor
+struct arithmetic_addition_visitor : public boost::static_visitor<>
+{
+public:
+ explicit arithmetic_addition_visitor(ConvertPolicy convert_policy)
+ : _policy(convert_policy)
+ {
+ }
+
+ template <typename T1, typename T2, typename T3>
+ void operator()(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out) const
+ {
+ tensor_operations::arithmetic_addition(in1, in2, out, _policy);
+ }
+
+private:
+ ConvertPolicy _policy;
+};
+// Arithmetic Subtraction visitor
+struct arithmetic_subtraction_visitor : public boost::static_visitor<>
+{
+public:
+ explicit arithmetic_subtraction_visitor(ConvertPolicy convert_policy)
+ : _policy(convert_policy)
+ {
+ }
+
+ template <typename T1, typename T2, typename T3>
+ void operator()(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out) const
+ {
+ tensor_operations::arithmetic_subtraction(in1, in2, out, _policy);
+ }
+
+private:
+ ConvertPolicy _policy;
+};
+// Depth Convert visitor
+struct depth_convert_visitor : public boost::static_visitor<>
+{
+public:
+ explicit depth_convert_visitor(ConvertPolicy policy, uint32_t shift)
+ : _policy(policy), _shift(shift)
+ {
+ }
+
+ template <typename T1, typename T2>
+ void operator()(const Tensor<T1> &in, Tensor<T2> &out) const
+ {
+ tensor_operations::depth_convert(in, out, _policy, _shift);
+ }
+
+private:
+ ConvertPolicy _policy;
+ uint32_t _shift;
+};
+// GEMM visitor
+struct gemm_visitor : public boost::static_visitor<>
+{
+public:
+ explicit gemm_visitor(const TensorVariant &in1, const TensorVariant &in2, const TensorVariant &in3, float alpha, float beta)
+ : _in1(in1), _in2(in2), _in3(in3), _alpha(alpha), _beta(beta)
+ {
+ }
+
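+ // All three input variants must hold the same element type T as the visited output tensor;
+ // boost::get throws boost::bad_get otherwise.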
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in1 = boost::get<Tensor<T>>(_in1);
+ const Tensor<T> &in2 = boost::get<Tensor<T>>(_in2);
+ const Tensor<T> &in3 = boost::get<Tensor<T>>(_in3);
+ tensor_operations::gemm(in1, in2, in3, out, _alpha, _beta);
+ }
+
+private:
+ const TensorVariant &_in1, &_in2, &_in3;
+ float _alpha;
+ float _beta;
+};
+// Pixel-wise Multiplication visitor
+struct pixel_wise_multiplication_visitor : public boost::static_visitor<>
+{
+public:
+ explicit pixel_wise_multiplication_visitor(float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+ : _scale(scale), _convert_policy(convert_policy), _rounding_policy(rounding_policy)
+ {
+ }
+
+ template <typename T1, typename T2, typename T3>
+ void operator()(const Tensor<T1> &in1, const Tensor<T2> &in2, Tensor<T3> &out) const
+ {
+ tensor_operations::pixel_wise_multiplication(in1, in2, out, _scale, _convert_policy, _rounding_policy);
+ }
+
+private:
+ float _scale;
+ ConvertPolicy _convert_policy;
+ RoundingPolicy _rounding_policy;
+};
+// Fixed Point Pixel-wise Multiplication visitor
+struct fixed_point_pixel_wise_multiplication_visitor : public boost::static_visitor<>
+{
+public:
+ explicit fixed_point_pixel_wise_multiplication_visitor(const TensorVariant &in1, const TensorVariant &in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+ : _in1(in1), _in2(in2), _scale(scale), _convert_policy(convert_policy), _rounding_policy(rounding_policy)
+ {
+ }
+
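+ // Only integral (fixed-point) element types are supported; the non-integral overload below raises an error at runtime.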
+ template <typename T, typename = typename std::enable_if<std::is_integral<T>::value>::type>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in1 = boost::get<Tensor<T>>(_in1);
+ const Tensor<T> &in2 = boost::get<Tensor<T>>(_in2);
+ tensor_operations::fixed_point_pixel_wise_multiplication(in1, in2, out, _scale, _convert_policy, _rounding_policy);
+ }
+ template < typename T, typename std::enable_if < !std::is_integral<T>::value, int >::type = 0 >
+ void operator()(Tensor<T> &out) const
+ {
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+
+private:
+ const TensorVariant &_in1;
+ const TensorVariant &_in2;
+ float _scale;
+ ConvertPolicy _convert_policy;
+ RoundingPolicy _rounding_policy;
+};
+// Threshold operation
+inline void threshold_operation(const Tensor<uint8_t> &in, Tensor<uint8_t> &out, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+ tensor_operations::threshold(in, out, threshold, false_value, true_value, type, upper);
+}
+// Activation layer visitor
+struct activation_layer_visitor : public boost::static_visitor<>
+{
+public:
+ explicit activation_layer_visitor(const TensorVariant &in, ActivationLayerInfo act_info)
+ : _in(in), _act_info(act_info)
+ {
+ }
+
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const auto &in = boost::get<Tensor<T>>(_in);
+ tensor_operations::activation_layer(in, out, _act_info);
+ }
+
+private:
+ const TensorVariant &_in;
+ const ActivationLayerInfo _act_info;
+};
+// Batch Normalization Layer visitor
+struct batch_normalization_layer_visitor : public boost::static_visitor<>
+{
+public:
+ explicit batch_normalization_layer_visitor(const TensorVariant &in, const TensorVariant &mean, const TensorVariant &var, const TensorVariant &beta, const TensorVariant &gamma, float epsilon,
+ int fixed_point_position = 0)
+ : _in(in), _mean(mean), _var(var), _beta(beta), _gamma(gamma), _epsilon(epsilon), _fixed_point_position(fixed_point_position)
+ {
+ }
+
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in = boost::get<Tensor<T>>(_in);
+ const Tensor<T> &mean = boost::get<Tensor<T>>(_mean);
+ const Tensor<T> &var = boost::get<Tensor<T>>(_var);
+ const Tensor<T> &beta = boost::get<Tensor<T>>(_beta);
+ const Tensor<T> &gamma = boost::get<Tensor<T>>(_gamma);
+ tensor_operations::batch_normalization_layer(in, out, mean, var, beta, gamma, _epsilon, _fixed_point_position);
+ }
+
+private:
+ const TensorVariant &_in, &_mean, &_var, &_beta, &_gamma;
+ float _epsilon;
+ int _fixed_point_position;
+};
+// Convolution Layer visitor
+struct convolution_layer_visitor : public boost::static_visitor<>
+{
+public:
+ explicit convolution_layer_visitor(const TensorVariant &in, const TensorVariant &weights, const TensorVariant &bias, PadStrideInfo conv_info)
+ : _in(in), _weights(weights), _bias(bias), _conv_info(conv_info)
+ {
+ }
+
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in = boost::get<Tensor<T>>(_in);
+ const Tensor<T> &weights = boost::get<Tensor<T>>(_weights);
+ const Tensor<T> &bias = boost::get<Tensor<T>>(_bias);
+ tensor_operations::convolution_layer(in, weights, bias, out, _conv_info);
+ }
+
+private:
+ const TensorVariant &_in;
+ const TensorVariant &_weights;
+ const TensorVariant &_bias;
+ PadStrideInfo _conv_info;
+};
+
+struct fully_connected_layer_visitor : public boost::static_visitor<>
+{
+public:
+ explicit fully_connected_layer_visitor(const TensorVariant &in, const TensorVariant &weights, const TensorVariant &bias)
+ : _in(in), _weights(weights), _bias(bias)
+ {
+ }
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in = boost::get<Tensor<T>>(_in);
+ const Tensor<T> &weights = boost::get<Tensor<T>>(_weights);
+ const Tensor<T> &bias = boost::get<Tensor<T>>(_bias);
+ tensor_operations::fully_connected_layer(in, weights, bias, out);
+ }
+
+private:
+ const TensorVariant &_in;
+ const TensorVariant &_weights;
+ const TensorVariant &_bias;
+};
+
+// Normalization Layer visitor
+struct normalization_layer_visitor : public boost::static_visitor<>
+{
+public:
+ explicit normalization_layer_visitor(const TensorVariant &in, NormalizationLayerInfo norm_info)
+ : _in(in), _norm_info(norm_info)
+ {
+ }
+
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in = boost::get<Tensor<T>>(_in);
+ tensor_operations::normalization_layer(in, out, _norm_info);
+ }
+
+private:
+ const TensorVariant &_in;
+ NormalizationLayerInfo _norm_info;
+};
+// Pooling layer
+struct pooling_layer_visitor : public boost::static_visitor<>
+{
+public:
+ explicit pooling_layer_visitor(const TensorVariant &in, PoolingLayerInfo pool_info, int fixed_point_position = 0)
+ : _in(in), _pool_info(pool_info), _fixed_point_position(fixed_point_position)
+ {
+ }
+
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in = boost::get<Tensor<T>>(_in);
+ tensor_operations::pooling_layer(in, out, _pool_info, _fixed_point_position);
+ }
+
+private:
+ const TensorVariant &_in;
+ PoolingLayerInfo _pool_info;
+ int _fixed_point_position;
+};
+// Softmax Layer visitor
+struct softmax_layer_visitor : public boost::static_visitor<>
+{
+public:
+ explicit softmax_layer_visitor(const TensorVariant &in)
+ : _in(in)
+ {
+ }
+
+ template <typename T>
+ void operator()(Tensor<T> &out) const
+ {
+ const auto &in = boost::get<Tensor<T>>(_in);
+ tensor_operations::softmax_layer(in, out);
+ }
+
+private:
+ const TensorVariant &_in;
+};
+// Fixed Point operations visitor
+struct fixed_point_operation_visitor : public boost::static_visitor<>
+{
+public:
+ explicit fixed_point_operation_visitor(const TensorVariant &in, FixedPointOp op)
+ : _in(in), _op(op)
+ {
+ }
+
+ template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+ void operator()(Tensor<T> &out) const
+ {
+ const Tensor<T> &in = boost::get<Tensor<T>>(_in);
+ tensor_operations::fixed_point_operation(in, out, _op);
+ }
+ template < typename T, typename std::enable_if < !std::is_integral<T>::value, int >::type = 0 >
+ void operator()(Tensor<T> &out) const
+ {
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+
+private:
+ const TensorVariant &_in;
+ FixedPointOp _op;
+};
+// Print Tensor visitor
+struct print_visitor : public boost::static_visitor<>
+{
+public:
+ explicit print_visitor(std::ostream &out)
+ : _out(out)
+ {
+ }
+
+ template <typename T>
+ void operator()(const Tensor<T> &in) const
+ {
+ tensor_operations::print(in, _out);
+ }
+
+private:
+ std::ostream &_out;
+};
+} // namespace tensor_visitors
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_TEST_TENSOR_VISITORS_H__ */
diff --git a/tests/validation/UNIT/CMakeLists.txt b/tests/validation/UNIT/CMakeLists.txt
new file mode 100644
index 0000000000..a0603f150c
--- /dev/null
+++ b/tests/validation/UNIT/CMakeLists.txt
@@ -0,0 +1,37 @@
+# Copyright (c) 2017 ARM Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+cmake_minimum_required (VERSION 3.1)
+
+set(arm_compute_test_validation_UNIT_SOURCE_FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/TensorInfo.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/Utils.cpp
+)
+
+add_library(arm_compute_test_validation_UNIT OBJECT
+ ${arm_compute_test_validation_UNIT_SOURCE_FILES}
+)
+
+set(arm_compute_test_validation_TARGET_OBJECTS
+ ${arm_compute_test_validation_TARGET_OBJECTS}
+ $<TARGET_OBJECTS:arm_compute_test_validation_UNIT>
+ PARENT_SCOPE
+)
diff --git a/tests/validation/UNIT/FixedPoint.cpp b/tests/validation/UNIT/FixedPoint.cpp
new file mode 100644
index 0000000000..63d4150318
--- /dev/null
+++ b/tests/validation/UNIT/FixedPoint.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "validation/FixedPoint.h"
+
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Validation.h"
+#include "validation/ValidationUserConfiguration.h"
+
+#include "boost_wrapper.h"
+
+#include <fstream>
+#include <vector>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+std::string func_names[] =
+{
+ "add", "sub", "mul", "exp", "log", "inv_sqrt"
+};
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(UNIT)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(FixedPointQS8Inputs, boost::unit_test::data::make(func_names) * boost::unit_test::data::xrange(1, 7), func_name, frac_bits)
+{
+ const std::string base_file_name = user_config.path.get() + "/dumps/" + func_name + "_Q8." + cpp11::to_string(frac_bits);
+ std::ifstream inputs_file{ base_file_name + ".in", std::ios::binary | std::ios::in };
+
+ BOOST_TEST_INFO(base_file_name + ".in");
+ BOOST_TEST_REQUIRE(inputs_file.good());
+
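+ // The dump values are expected to be exactly representable in QS8 with frac_bits fractional
+ // bits, so the float -> fixed-point -> float round trip below must be lossless.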
+ float float_val = 0.f;
+
+ // Read first value
+ inputs_file.read(reinterpret_cast<char *>(&float_val), sizeof(float_val));
+
+ while(inputs_file.good())
+ {
+ // Convert to fixed point
+ fixed_point_arithmetic::fixed_point<int8_t> in_val(float_val, frac_bits);
+
+ // Check that the value didn't change
+ BOOST_TEST(static_cast<float>(in_val) == float_val);
+
+ // Read next value
+ inputs_file.read(reinterpret_cast<char *>(&float_val), sizeof(float_val));
+ }
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+//FIXME: Figure out how to handle expected failures properly
+// The last input argument specifies the expected number of failures for a
+// given combination of (function name, number of fractional bits) as defined
+// by the first two arguments.
+BOOST_DATA_TEST_CASE(FixedPointQS8Outputs, (boost::unit_test::data::make(func_names) * boost::unit_test::data::xrange(1, 7)) ^ (boost::unit_test::data::make({ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 13, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 32, 67 })),
+ func_name, frac_bits, expected_failures)
+{
+ const std::string base_file_name = user_config.path.get() + "/dumps/" + func_name + "_Q8." + cpp11::to_string(frac_bits);
+ std::ifstream inputs_file{ base_file_name + ".in", std::ios::binary | std::ios::in };
+ std::ifstream reference_file{ base_file_name + ".out", std::ios::binary | std::ios::in };
+
+ BOOST_TEST_INFO(base_file_name + ".in");
+ BOOST_TEST_REQUIRE(inputs_file.good());
+ BOOST_TEST_INFO(base_file_name + ".out");
+ BOOST_TEST_REQUIRE(reference_file.good());
+
+ const float step_size = std::pow(2.f, -frac_bits);
+
+ float float_val = 0.f;
+ float ref_val = 0.f;
+ int64_t num_mismatches = 0;
+
+ // Read first values
+ inputs_file.read(reinterpret_cast<char *>(&float_val), sizeof(float_val));
+ reference_file.read(reinterpret_cast<char *>(&ref_val), sizeof(ref_val));
+
+ while(inputs_file.good() && reference_file.good())
+ {
+ fixed_point_arithmetic::fixed_point<int8_t> in_val(float_val, frac_bits);
+ fixed_point_arithmetic::fixed_point<int8_t> out_val(0.f, frac_bits);
+
+ float tolerance = 0.f;
+
+ if(func_name == "add")
+ {
+ out_val = in_val + in_val;
+ }
+ else if(func_name == "sub")
+ {
+ out_val = in_val - in_val; //NOLINT
+ }
+ else if(func_name == "mul")
+ {
+ tolerance = 1.f * step_size;
+ out_val = in_val * in_val;
+ }
+ else if(func_name == "exp")
+ {
+ tolerance = 2.f * step_size;
+ out_val = fixed_point_arithmetic::exp(in_val);
+ }
+ else if(func_name == "log")
+ {
+ tolerance = 4.f * step_size;
+ out_val = fixed_point_arithmetic::log(in_val);
+ }
+ else if(func_name == "inv_sqrt")
+ {
+ tolerance = 5.f * step_size;
+ out_val = fixed_point_arithmetic::inv_sqrt(in_val);
+ }
+
+ BOOST_TEST_INFO("input = " << in_val);
+ BOOST_TEST_INFO("output = " << out_val);
+ BOOST_TEST_INFO("reference = " << ref_val);
+ BOOST_TEST_INFO("tolerance = " << tolerance);
+ BOOST_TEST_WARN((std::abs(static_cast<float>(out_val) - ref_val) <= tolerance));
+
+ if(std::abs(static_cast<float>(out_val) - ref_val) > tolerance)
+ {
+ ++num_mismatches;
+ }
+
+ // Read next values
+ inputs_file.read(reinterpret_cast<char *>(&float_val), sizeof(float_val));
+ reference_file.read(reinterpret_cast<char *>(&ref_val), sizeof(ref_val));
+ }
+
+ BOOST_TEST(num_mismatches == expected_failures);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/UNIT/TensorInfo.cpp b/tests/validation/UNIT/TensorInfo.cpp
new file mode 100644
index 0000000000..11ed9f6dcc
--- /dev/null
+++ b/tests/validation/UNIT/TensorInfo.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "TypePrinter.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+#include "boost_wrapper.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::validation;
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(UNIT)
+BOOST_AUTO_TEST_SUITE(TensorInfoValidation)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(AutoPadding,
+ boost::unit_test::data::make({ TensorShape{},
+ TensorShape{ 10U },
+ TensorShape{ 10U, 10U },
+ TensorShape{ 10U, 10U, 10U },
+ TensorShape{ 10U, 10U, 10U, 10U },
+ TensorShape{ 10U, 10U, 10U, 10U, 10U },
+ TensorShape{ 10U, 10U, 10U, 10U, 10U, 10U }
+ })
+ ^ boost::unit_test::data::make({ PaddingSize{ 0, 0, 0, 0 },
+ PaddingSize{ 0, 36, 0, 4 },
+ PaddingSize{ 4, 36, 4, 4 },
+ PaddingSize{ 4, 36, 4, 4 },
+ PaddingSize{ 4, 36, 4, 4 },
+ PaddingSize{ 4, 36, 4, 4 },
+ PaddingSize{ 4, 36, 4, 4 }
+ })
+ ^ boost::unit_test::data::make({ Strides{},
+ Strides{ 1U },
+ Strides{ 1U, 50U },
+ Strides{ 1U, 50U, 900U },
+ Strides{ 1U, 50U, 900U, 9000U },
+ Strides{ 1U, 50U, 900U, 9000U, 90000U },
+ Strides{ 1U, 50U, 900U, 9000U, 90000U, 900000U }
+ })
+ ^ boost::unit_test::data::make(
+{
+ 0,
+ 4,
+ 204,
+ 204,
+ 204,
+ 204,
+ 204,
+}),
+shape, auto_padding, strides, offset)
+{
+ TensorInfo info{ shape, Format::U8 };
+
+ BOOST_TEST(!info.has_padding());
+
+ info.auto_padding();
+
+ validate(info.padding(), auto_padding);
+ BOOST_TEST(compare_dimensions(info.strides_in_bytes(), strides));
+ BOOST_TEST(info.offset_first_element_in_bytes() == offset);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/UNIT/TensorShape.cpp b/tests/validation/UNIT/TensorShape.cpp
new file mode 100644
index 0000000000..2d78cd549a
--- /dev/null
+++ b/tests/validation/UNIT/TensorShape.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "TypePrinter.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/TensorShape.h"
+
+#include "boost_wrapper.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::validation;
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(UNIT)
+BOOST_AUTO_TEST_SUITE(TensorShapeValidation)
+
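+// Each expected total size is the product of the shape's dimensions,
+// e.g. TensorShape{ 2U, 3U, 5U, 7U } has 2 * 3 * 5 * 7 = 210 elements.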
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Construction,
+ boost::unit_test::data::make({ TensorShape{},
+ TensorShape{ 1U },
+ TensorShape{ 2U },
+ TensorShape{ 2U, 3U },
+ TensorShape{ 2U, 3U, 5U },
+ TensorShape{ 2U, 3U, 5U, 7U },
+ TensorShape{ 2U, 3U, 5U, 7U, 11U },
+ TensorShape{ 2U, 3U, 5U, 7U, 11U, 13U }
+ })
+ ^ boost::unit_test::data::make({ 0, 0, 1, 2, 3, 4, 5, 6 }) ^ boost::unit_test::data::make({ 0, 1, 2, 6, 30, 210, 2310, 30030 }),
+ shape, num_dimensions, total_size)
+{
+ BOOST_TEST(shape.num_dimensions() == num_dimensions);
+ BOOST_TEST(shape.total_size() == total_size);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(SetEmpty, boost::unit_test::data::make({ 0, 1, 2, 3, 4, 5 }), dimension)
+{
+ TensorShape shape;
+
+ shape.set(dimension, 10);
+
+ BOOST_TEST(shape.num_dimensions() == dimension + 1);
+ BOOST_TEST(shape.total_size() == 10);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/UNIT/Utils.cpp b/tests/validation/UNIT/Utils.cpp
new file mode 100644
index 0000000000..7a09be52b5
--- /dev/null
+++ b/tests/validation/UNIT/Utils.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Utils.h"
+
+#include "TypePrinter.h"
+#include "validation/Validation.h"
+
+#include "boost_wrapper.h"
+
+#include <stdexcept>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::cpp11;
+using namespace arm_compute::test::validation;
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(UNIT)
+BOOST_AUTO_TEST_SUITE(Utils)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RoundHalfUp, boost::unit_test::data::make({ 1.f, 1.2f, 1.5f, 2.5f, 2.9f, -3.f, -3.5f, -3.8f, -4.3f, -4.5f }) ^ boost::unit_test::data::make({ 1.f, 1.f, 2.f, 3.f, 3.f, -3.f, -3.f, -4.f, -4.f, -4.f }),
+ value, result)
+{
+ BOOST_TEST(cpp11::round_half_up(value) == result);
+}
+
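+// Unlike round-half-up above, ties round to the nearest even value here, e.g. 2.5 -> 2 and -3.5 -> -4.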
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RoundHalfEven, boost::unit_test::data::make({ 1.f, 1.2f, 1.5f, 2.5f, 2.9f, -3.f, -3.5f, -3.8f, -4.3f, -4.5f }) ^ boost::unit_test::data::make({ 1.f, 1.f, 2.f, 2.f, 3.f, -3.f, -4.f, -4.f, -4.f, -4.f }),
+ value, result)
+{
+ BOOST_TEST(cpp11::round_half_even(value) == result);
+}
+
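+// index2coord/coord2index map between a linear element index and coordinates with the x dimension
+// varying fastest, e.g. in shape { 2U, 3U } the coordinate (0, 1) corresponds to index 0 + 1 * 2 = 2.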
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Index2Coord, boost::unit_test::data::make({ TensorShape{ 1U }, TensorShape{ 2U }, TensorShape{ 2U, 3U } }) ^ boost::unit_test::data::make({ 0, 1, 2 }) ^
+ boost::unit_test::data::make({ Coordinates{ 0 }, Coordinates{ 1 }, Coordinates{ 0, 1 } }), shape, index, ref_coordinate)
+{
+ Coordinates coordinate = index2coord(shape, index);
+
+ BOOST_TEST(compare_dimensions(coordinate, ref_coordinate));
+}
+
+//FIXME: Negative tests only work in debug mode
+#if 0
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Index2CoordFail, boost::unit_test::data::make({ TensorShape{}, TensorShape{ 2U }, TensorShape{ 2U } }) ^ boost::unit_test::data::make({ 0, -1, 2 }), shape, index)
+{
+ BOOST_CHECK_THROW(index2coord(shape, index), std::runtime_error);
+}
+#endif
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Coord2Index, boost::unit_test::data::make({ TensorShape{ 1U }, TensorShape{ 2U }, TensorShape{ 2U, 3U } }) ^ boost::unit_test::data::make({ Coordinates{ 0 }, Coordinates{ 1 }, Coordinates{ 0, 1 } })
+ ^ boost::unit_test::data::make({ 0, 1, 2 }),
+ shape, coordinate, ref_index)
+{
+ int index = coord2index(shape, coordinate);
+
+ BOOST_TEST(index == ref_index);
+}
+
+//FIXME: Negative tests only work in debug mode
+#if 0
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Coord2IndexFail, boost::unit_test::data::make({ TensorShape{}, TensorShape{ 2U } }) ^ boost::unit_test::data::make({ Coordinates{ 0 }, Coordinates{} }), shape, coordinate)
+{
+ BOOST_CHECK_THROW(coord2index(shape, coordinate), std::runtime_error);
+}
+#endif
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/Validation.cpp b/tests/validation/Validation.cpp
new file mode 100644
index 0000000000..335d2644d3
--- /dev/null
+++ b/tests/validation/Validation.cpp
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Validation.h"
+
+#include "IAccessor.h"
+#include "RawTensor.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <array>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <iomanip>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Read the value at @p ptr, interpreting it according to @p data_type, and convert it to double.
+ *
+ * @param[in] ptr Pointer to the value.
+ * @param[in] data_type Data type of the pointed-to value.
+ *
+ * @return The value pointed to by @p ptr converted to double.
+ */
+double get_double_data(const void *ptr, DataType data_type)
+{
+ switch(data_type)
+ {
+ case DataType::U8:
+ return *reinterpret_cast<const uint8_t *>(ptr);
+ case DataType::S8:
+ return *reinterpret_cast<const int8_t *>(ptr);
+ case DataType::QS8:
+ return *reinterpret_cast<const qint8_t *>(ptr);
+ case DataType::U16:
+ return *reinterpret_cast<const uint16_t *>(ptr);
+ case DataType::S16:
+ return *reinterpret_cast<const int16_t *>(ptr);
+ case DataType::U32:
+ return *reinterpret_cast<const uint32_t *>(ptr);
+ case DataType::S32:
+ return *reinterpret_cast<const int32_t *>(ptr);
+ case DataType::U64:
+ return *reinterpret_cast<const uint64_t *>(ptr);
+ case DataType::S64:
+ return *reinterpret_cast<const int64_t *>(ptr);
+#if ENABLE_FP16
+ case DataType::F16:
+ return *reinterpret_cast<const float16_t *>(ptr);
+#endif
+ case DataType::F32:
+ return *reinterpret_cast<const float *>(ptr);
+ case DataType::F64:
+ return *reinterpret_cast<const double *>(ptr);
+ case DataType::SIZET:
+ return *reinterpret_cast<const size_t *>(ptr);
+ default:
+ ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+ }
+}
+
+void check_border_element(const IAccessor &tensor, const Coordinates &id,
+ const BorderMode &border_mode, const void *border_value,
+ int64_t &num_elements, int64_t &num_mismatches)
+{
+ const size_t channel_size = element_size_from_data_type(tensor.data_type());
+ const auto ptr = static_cast<const uint8_t *>(tensor(id));
+
+ if(border_mode == BorderMode::REPLICATE)
+ {
+ Coordinates border_id{ id };
+ border_id.set(1, 0);
+ border_value = tensor(border_id);
+ }
+
+ // Iterate over all channels within one element
+ for(int channel = 0; channel < tensor.num_channels(); ++channel)
+ {
+ const size_t channel_offset = channel * channel_size;
+ const double target = get_double_data(ptr + channel_offset, tensor.data_type());
+ const double ref = get_double_data(static_cast<const uint8_t *>(border_value) + channel_offset, tensor.data_type());
+ const double difference = target - ref;
+
+ BOOST_TEST_INFO("id = " << id);
+ BOOST_TEST_INFO("channel = " << channel);
+ BOOST_TEST_INFO("reference = " << std::setprecision(5) << ref);
+ BOOST_TEST_INFO("target = " << std::setprecision(5) << target);
+ BOOST_TEST_WARN(difference == 0);
+
+ if(difference != 0.f)
+ {
+ ++num_mismatches;
+ }
+
+ ++num_elements;
+ }
+}
+
+void check_single_element(const Coordinates &id, const IAccessor &tensor, const RawTensor &reference, float tolerance_value,
+ uint64_t wrap_range, int min_channels, size_t channel_size, int64_t &num_mismatches, int64_t &num_elements)
+{
+ const auto ptr = static_cast<const uint8_t *>(tensor(id));
+ const auto ref_ptr = static_cast<const uint8_t *>(reference(id));
+
+ // Iterate over all channels within one element
+ for(int channel = 0; channel < min_channels; ++channel)
+ {
+ const size_t channel_offset = channel * channel_size;
+ const double target = get_double_data(ptr + channel_offset, reference.data_type());
+ const double ref = get_double_data(ref_ptr + channel_offset, reference.data_type());
+ const double difference = target - ref;
+
+ BOOST_TEST_INFO("id = " << id);
+ BOOST_TEST_INFO("channel = " << channel);
+ BOOST_TEST_INFO("reference = " << std::setprecision(5) << ref);
+ BOOST_TEST_INFO("target = " << std::setprecision(5) << target);
+ BOOST_TEST_WARN(difference == 0);
+
+ if(std::abs(difference) > tolerance_value)
+ {
+ // Count a mismatch unless wrap-around is tolerated (wrap_range != 0)
+ // and the wrapped difference (wrap_range - |difference|) is within tolerance_value
+ if(wrap_range == 0 || (wrap_range - std::abs(difference)) > tolerance_value)
+ {
+ ++num_mismatches;
+ }
+ }
+ ++num_elements;
+ }
+}
+} // namespace
+
+void validate(const arm_compute::ValidRegion &region, const arm_compute::ValidRegion &reference)
+{
+ BOOST_TEST(region.anchor.num_dimensions() == reference.anchor.num_dimensions());
+ BOOST_TEST(region.shape.num_dimensions() == reference.shape.num_dimensions());
+
+ for(unsigned int d = 0; d < region.anchor.num_dimensions(); ++d)
+ {
+ BOOST_TEST(region.anchor[d] == reference.anchor[d]);
+ }
+
+ for(unsigned int d = 0; d < region.shape.num_dimensions(); ++d)
+ {
+ BOOST_TEST(region.shape[d] == reference.shape[d]);
+ }
+}
+
+void validate(const arm_compute::PaddingSize &padding, const arm_compute::PaddingSize &reference)
+{
+ BOOST_TEST(padding.top == reference.top);
+ BOOST_TEST(padding.right == reference.right);
+ BOOST_TEST(padding.bottom == reference.bottom);
+ BOOST_TEST(padding.left == reference.left);
+}
+
+void validate(const IAccessor &tensor, const RawTensor &reference, float tolerance_value, float tolerance_number, uint64_t wrap_range)
+{
+ // Validate with valid region covering the entire shape
+ validate(tensor, reference, shape_to_valid_region(tensor.shape()), tolerance_value, tolerance_number, wrap_range);
+}
+
+void validate(const IAccessor &tensor, const RawTensor &reference, const ValidRegion &valid_region, float tolerance_value, float tolerance_number, uint64_t wrap_range)
+{
+ int64_t num_mismatches = 0;
+ int64_t num_elements = 0;
+
+ BOOST_TEST(tensor.element_size() == reference.element_size());
+ BOOST_TEST(tensor.format() == reference.format());
+ BOOST_TEST(tensor.data_type() == reference.data_type());
+ BOOST_TEST(tensor.num_channels() == reference.num_channels());
+ BOOST_TEST(compare_dimensions(tensor.shape(), reference.shape()));
+
+ const int min_elements = std::min(tensor.num_elements(), reference.num_elements());
+ const int min_channels = std::min(tensor.num_channels(), reference.num_channels());
+ const size_t channel_size = element_size_from_data_type(reference.data_type());
+
+ // Iterate over all elements within valid region, e.g. U8, S16, RGB888, ...
+ for(int element_idx = 0; element_idx < min_elements; ++element_idx)
+ {
+ const Coordinates id = index2coord(reference.shape(), element_idx);
+ if(is_in_valid_region(valid_region, id))
+ {
+ check_single_element(id, tensor, reference, tolerance_value, wrap_range, min_channels, channel_size, num_mismatches, num_elements);
+ }
+ }
+
+ const int64_t absolute_tolerance_number = tolerance_number * num_elements;
+ const float percent_mismatches = static_cast<float>(num_mismatches) / num_elements * 100.f;
+
+ BOOST_TEST(num_mismatches <= absolute_tolerance_number,
+ num_mismatches << " values (" << std::setprecision(2) << percent_mismatches
+ << "%) mismatched (maximum tolerated " << std::setprecision(2) << tolerance_number << "%)");
+}
+
+void validate(const IAccessor &tensor, const void *reference_value)
+{
+ BOOST_TEST_REQUIRE((reference_value != nullptr));
+
+ int64_t num_mismatches = 0;
+ int64_t num_elements = 0;
+ const size_t channel_size = element_size_from_data_type(tensor.data_type());
+
+ // Iterate over all elements, e.g. U8, S16, RGB888, ...
+ for(int element_idx = 0; element_idx < tensor.num_elements(); ++element_idx)
+ {
+ const Coordinates id = index2coord(tensor.shape(), element_idx);
+
+ const auto ptr = static_cast<const uint8_t *>(tensor(id));
+
+ // Iterate over all channels within one element
+ for(int channel = 0; channel < tensor.num_channels(); ++channel)
+ {
+ const size_t channel_offset = channel * channel_size;
+ const double target = get_double_data(ptr + channel_offset, tensor.data_type());
+ const double ref = get_double_data(reference_value, tensor.data_type());
+ const double difference = target - ref;
+
+ BOOST_TEST_INFO("id = " << id);
+ BOOST_TEST_INFO("channel = " << channel);
+ BOOST_TEST_INFO("reference = " << std::setprecision(5) << ref);
+ BOOST_TEST_INFO("target = " << std::setprecision(5) << target);
+ BOOST_TEST_WARN(difference == 0);
+
+ if(difference != 0.f)
+ {
+ ++num_mismatches;
+ }
+
+ ++num_elements;
+ }
+ }
+
+ const float percent_mismatches = static_cast<float>(num_mismatches) / num_elements * 100.f;
+
+ BOOST_TEST(num_mismatches == 0,
+ num_mismatches << " values (" << std::setprecision(2) << percent_mismatches << "%) mismatched");
+}
+
+void validate(const IAccessor &tensor, BorderSize border_size, const BorderMode &border_mode, const void *border_value)
+{
+ if(border_mode == BorderMode::UNDEFINED)
+ {
+ return;
+ }
+ else if(border_mode == BorderMode::CONSTANT)
+ {
+ BOOST_TEST((border_value != nullptr));
+ }
+
+ int64_t num_mismatches = 0;
+ int64_t num_elements = 0;
+ const int slice_size = tensor.shape()[0] * tensor.shape()[1];
+
+ for(int element_idx = 0; element_idx < tensor.num_elements(); element_idx += slice_size)
+ {
+ Coordinates id = index2coord(tensor.shape(), element_idx);
+
+ // Top border
+ for(int y = -border_size.top; y < 0; ++y)
+ {
+ id.set(1, y);
+
+ for(int x = -border_size.left; x < static_cast<int>(tensor.shape()[0]) + static_cast<int>(border_size.right); ++x)
+ {
+ id.set(0, x);
+
+ check_border_element(tensor, id, border_mode, border_value, num_elements, num_mismatches);
+ }
+ }
+
+ // Bottom border
+ for(int y = tensor.shape()[1]; y < static_cast<int>(tensor.shape()[1]) + static_cast<int>(border_size.bottom); ++y)
+ {
+ id.set(1, y);
+
+ for(int x = -border_size.left; x < static_cast<int>(tensor.shape()[0]) + static_cast<int>(border_size.right); ++x)
+ {
+ id.set(0, x);
+
+ check_border_element(tensor, id, border_mode, border_value, num_elements, num_mismatches);
+ }
+ }
+
+ // Left/right border
+ for(int y = 0; y < static_cast<int>(tensor.shape()[1]); ++y)
+ {
+ id.set(1, y);
+
+ // Left border
+ for(int x = -border_size.left; x < 0; ++x)
+ {
+ id.set(0, x);
+
+ check_border_element(tensor, id, border_mode, border_value, num_elements, num_mismatches);
+ }
+
+ // Right border
+ for(int x = tensor.shape()[0]; x < static_cast<int>(tensor.shape()[0]) + static_cast<int>(border_size.right); ++x)
+ {
+ id.set(0, x);
+
+ check_border_element(tensor, id, border_mode, border_value, num_elements, num_mismatches);
+ }
+ }
+ }
+
+ const float percent_mismatches = static_cast<float>(num_mismatches) / num_elements * 100.f;
+
+ BOOST_TEST(num_mismatches == 0,
+ num_mismatches << " values (" << std::setprecision(2) << percent_mismatches << "%) mismatched");
+}
+
+void validate(std::vector<unsigned int> classified_labels, std::vector<unsigned int> expected_labels)
+{
+ BOOST_TEST(expected_labels.size() != 0);
+ BOOST_TEST(classified_labels.size() == expected_labels.size());
+
+ for(unsigned int i = 0; i < expected_labels.size(); ++i)
+ {
+ BOOST_TEST(classified_labels[i] == expected_labels[i]);
+ }
+}
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
new file mode 100644
index 0000000000..865d05b1f6
--- /dev/null
+++ b/tests/validation/Validation.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_REFERENCE_VALIDATION_H__
+#define __ARM_COMPUTE_TEST_REFERENCE_VALIDATION_H__
+
+#include "arm_compute/core/Types.h"
+
+#include "boost_wrapper.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class Tensor;
+
+namespace test
+{
+class RawTensor;
+class IAccessor;
+
+namespace validation
+{
+template <typename T>
+boost::test_tools::predicate_result compare_dimensions(const Dimensions<T> &dimensions1, const Dimensions<T> &dimensions2)
+{
+ if(dimensions1.num_dimensions() != dimensions2.num_dimensions())
+ {
+ boost::test_tools::predicate_result result(false);
+ result.message() << "Different dimensionality [" << dimensions1.num_dimensions() << "!=" << dimensions2.num_dimensions() << "]";
+ return result;
+ }
+
+ for(unsigned int i = 0; i < dimensions1.num_dimensions(); ++i)
+ {
+ if(dimensions1[i] != dimensions2[i])
+ {
+ boost::test_tools::predicate_result result(false);
+ result.message() << "Mismatch in dimension " << i << " [" << dimensions1[i] << "!=" << dimensions2[i] << "]";
+ return result;
+ }
+ }
+
+ return true;
+}
+
+/** Validate valid regions.
+ *
+ * - Dimensionality has to be the same.
+ * - Anchors have to match.
+ * - Shapes have to match.
+ */
+void validate(const arm_compute::ValidRegion &region, const arm_compute::ValidRegion &reference);
+
+/** Validate padding.
+ *
+ * Padding on all sides has to be the same.
+ */
+void validate(const arm_compute::PaddingSize &padding, const arm_compute::PaddingSize &reference);
+
+/** Validate tensors.
+ *
+ * - Dimensionality has to be the same.
+ * - All values have to match.
+ *
+ * @note: wrap_range tolerates cases where the reference value rounds up to the wrapping point and wraps around to
+ * zero while the test value stays at the wrapping point. This may let genuinely erroneous cases pass (where the
+ * difference between reference and test value is a multiple of wrap_range), but such errors would be detected by
+ * other test cases.
+ */
+void validate(const IAccessor &tensor, const RawTensor &reference, float tolerance_value = 0.f, float tolerance_number = 0.f, uint64_t wrap_range = 0);
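+// Example (hypothetical U8 case): with wrap_range = 256 and tolerance_value = 1, a reference value
+// that rounded up to 256 and wrapped around to 0 still matches a target value of 255, because
+// wrap_range - |255 - 0| = 1 does not exceed the tolerance.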
+
+/** Validate tensors with valid region.
+ *
+ * - Dimensionality has to be the same.
+ * - All values have to match.
+ *
+ * @note: wrap_range tolerates cases where the reference value rounds up to the wrapping point and wraps around to
+ * zero while the test value stays at the wrapping point. This may let genuinely erroneous cases pass (where the
+ * difference between reference and test value is a multiple of wrap_range), but such errors would be detected by
+ * other test cases.
+ */
+void validate(const IAccessor &tensor, const RawTensor &reference, const ValidRegion &valid_region, float tolerance_value = 0.f, float tolerance_number = 0.f, uint64_t wrap_range = 0);
+
+/** Validate tensors against constant value.
+ *
+ * - All values have to match.
+ */
+void validate(const IAccessor &tensor, const void *reference_value);
+
+/** Validate border against a constant value.
+ *
+ * - All border values have to match the specified value if mode is CONSTANT.
+ * - All border values have to be replicated if mode is REPLICATE.
+ * - Nothing is validated for mode UNDEFINED.
+ */
+void validate(const IAccessor &tensor, BorderSize border_size, const BorderMode &border_mode, const void *border_value);
+
+/** Validate classified labels against expected ones.
+ *
+ * - All values have to match.
+ */
+void validate(std::vector<unsigned int> classified_labels, std::vector<unsigned int> expected_labels);
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/validation/ValidationProgramOptions.cpp b/tests/validation/ValidationProgramOptions.cpp
new file mode 100644
index 0000000000..adb8c5ab6c
--- /dev/null
+++ b/tests/validation/ValidationProgramOptions.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ValidationProgramOptions.h"
+
+#include <thread>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma GCC diagnostic ignored "-Wctor-dtor-privacy"
+#include "boost/program_options.hpp"
+#pragma GCC diagnostic pop
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+ValidationProgramOptions::ValidationProgramOptions()
+{
+ boost::program_options::options_description options("Validation options");
+ options.add_options()("runs", boost::program_options::value<unsigned int>()->default_value(1), "Repetitions per test");
+ options.add_options()("threads", boost::program_options::value<unsigned int>()->default_value(std::thread::hardware_concurrency()), "Number of parallel CPU threads");
+ add_options(options);
+}
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/ValidationProgramOptions.h b/tests/validation/ValidationProgramOptions.h
new file mode 100644
index 0000000000..bf30db960d
--- /dev/null
+++ b/tests/validation/ValidationProgramOptions.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_VALIDATION_PROGRAM_OPTIONS_H__
+#define __ARM_COMPUTE_TEST_VALIDATION_PROGRAM_OPTIONS_H__
+
+#include "ProgramOptions.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Subclass of @ref ProgramOptions that adds validation specific options. */
+class ValidationProgramOptions : public ProgramOptions
+{
+public:
+ /** Defines additional options. */
+ ValidationProgramOptions();
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/validation/ValidationUserConfiguration.h b/tests/validation/ValidationUserConfiguration.h
new file mode 100644
index 0000000000..28b58e8375
--- /dev/null
+++ b/tests/validation/ValidationUserConfiguration.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_REFERENCE_VALIDATION_USER_CONFIGURATION_H__
+#define __ARM_COMPUTE_TEST_REFERENCE_VALIDATION_USER_CONFIGURATION_H__
+
+#include "UserConfiguration.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+// Validation requires no specific configuration
+using ValidationUserConfiguration = UserConfiguration;
+} // namespace validation
+
+extern validation::ValidationUserConfiguration user_config;
+} // namespace test
+} // namespace arm_compute
+#endif
diff --git a/tests/validation/main.cpp b/tests/validation/main.cpp
new file mode 100644
index 0000000000..844ee36200
--- /dev/null
+++ b/tests/validation/main.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define BOOST_TEST_ALTERNATIVE_INIT_API
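+// With the alternative init API, Boost.Test calls the bool init_unit_test() defined at the
+// bottom of this file instead of the default initialisation function.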
+
+#include "Globals.h"
+#include "TensorLibrary.h"
+#include "Utils.h"
+#include "ValidationProgramOptions.h"
+#include "ValidationUserConfiguration.h"
+
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "boost_wrapper.h"
+
+#include <iostream>
+#include <memory>
+#include <random>
+
+using namespace arm_compute::test;
+using namespace arm_compute::test::validation;
+
+namespace arm_compute
+{
+namespace test
+{
+ValidationUserConfiguration user_config;
+std::unique_ptr<TensorLibrary> library;
+} // namespace test
+} // namespace arm_compute
+
+struct GlobalFixture
+{
+ GlobalFixture()
+ {
+ if(user_config.seed.is_set())
+ {
+ library = cpp14::make_unique<TensorLibrary>(user_config.path.get(), user_config.seed);
+ }
+ else
+ {
+ library = cpp14::make_unique<TensorLibrary>(user_config.path.get());
+ }
+
+ BOOST_TEST_MESSAGE("Seed: " << library->seed());
+ }
+};
+
+BOOST_GLOBAL_FIXTURE(GlobalFixture);
+
+bool init_unit_test()
+{
+ boost::unit_test::framework::master_test_suite().p_name.value = "Compute Library Validation Tests";
+
+ ValidationProgramOptions options;
+
+ int &argc = boost::unit_test::framework::master_test_suite().argc;
+ char **argv = boost::unit_test::framework::master_test_suite().argv;
+
+ try
+ {
+ options.parse_commandline(argc, argv);
+
+ if(options.wants_help())
+ {
+ std::cout << "Usage: " << argv[0] << " [options] PATH\n";
+ std::cout << options.get_help() << "\n";
+ return false;
+ }
+
+ user_config = ValidationUserConfiguration(options);
+ }
+ catch(const boost::program_options::required_option &err)
+ {
+ std::cerr << "Error: " << err.what() << "\n";
+ std::cout << "\nUsage: " << argv[0] << " [options] PATH\n";
+ std::cout << options.get_help() << "\n";
+ return false;
+ }
+
+ std::cout << "Using " << user_config.threads << " CPU " << (user_config.threads == 1 ? "thread" : "threads") << "\n";
+ arm_compute::Scheduler::get().set_num_threads(user_config.threads);
+ return true;
+}
diff --git a/tests/validation/system_tests/CL/AlexNet.cpp b/tests/validation/system_tests/CL/AlexNet.cpp
new file mode 100644
index 0000000000..f7a88207c4
--- /dev/null
+++ b/tests/validation/system_tests/CL/AlexNet.cpp
@@ -0,0 +1,111 @@
+#ifdef INTERNAL_ONLY //FIXME Delete this file before the release
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "model_objects/AlexNet.h"
+
+#include <array>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::cl;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+using CLAlexNetModel = model_objects::AlexNet<ICLTensor,
+ CLTensor,
+ CLSubTensor,
+ CLAccessor,
+ CLActivationLayer,
+ CLConvolutionLayer,
+ CLFullyConnectedLayer,
+ CLNormalizationLayer,
+ CLPoolingLayer,
+ CLSoftmaxLayer>;
+std::vector<unsigned int> compute_alexnet(unsigned int batches, std::string input_file)
+{
+ std::vector<std::string> weight_files = { "cnn_data/alexnet_model/conv1_w.dat",
+ "cnn_data/alexnet_model/conv2_w.dat",
+ "cnn_data/alexnet_model/conv3_w.dat",
+ "cnn_data/alexnet_model/conv4_w.dat",
+ "cnn_data/alexnet_model/conv5_w.dat",
+ "cnn_data/alexnet_model/fc6_w.dat",
+ "cnn_data/alexnet_model/fc7_w.dat",
+ "cnn_data/alexnet_model/fc8_w.dat"
+ };
+
+ std::vector<std::string> bias_files = { "cnn_data/alexnet_model/conv1_b.dat",
+ "cnn_data/alexnet_model/conv2_b.dat",
+ "cnn_data/alexnet_model/conv3_b.dat",
+ "cnn_data/alexnet_model/conv4_b.dat",
+ "cnn_data/alexnet_model/conv5_b.dat",
+ "cnn_data/alexnet_model/fc6_b.dat",
+ "cnn_data/alexnet_model/fc7_b.dat",
+ "cnn_data/alexnet_model/fc8_b.dat"
+ };
+ CLAlexNetModel network{};
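+ // Typical lifecycle: initialise the weight shapes for the batch size, build
+ // the network, allocate its tensors, load the trained weights/biases, feed
+ // the input and run the classification.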
+ network.init_weights(batches);
+ network.build();
+ network.allocate();
+ network.fill(weight_files, bias_files);
+ network.feed(std::move(input_file));
+ network.run();
+
+ return network.get_classifications();
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(SYSTEM_TESTS)
+BOOST_AUTO_TEST_SUITE(CL)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_AUTO_TEST_CASE(AlexNet)
+{
+ // Compute AlexNet
+ std::vector<unsigned int> classified_labels = compute_alexnet(1, "cnn_data/imagenet_data/shark.dat");
+
+ // Expected labels
+ std::vector<unsigned int> expected_labels = { 2 };
+
+ // Validate labels
+ validate(classified_labels, expected_labels);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* DOXYGEN_SKIP_THIS */
+#endif /* INTERNAL_ONLY */
diff --git a/tests/validation/system_tests/CL/LeNet5.cpp b/tests/validation/system_tests/CL/LeNet5.cpp
new file mode 100644
index 0000000000..8b83cfa3a6
--- /dev/null
+++ b/tests/validation/system_tests/CL/LeNet5.cpp
@@ -0,0 +1,94 @@
+#ifdef INTERNAL_ONLY //FIXME Delete this file before the release
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CL/CLAccessor.h"
+#include "CL/Helper.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
+#include "model_objects/LeNet5.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::cl;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+using CLLeNet5Model = model_objects::LeNet5<CLTensor,
+ CLAccessor,
+ CLActivationLayer,
+ CLConvolutionLayer,
+ CLFullyConnectedLayer,
+ CLPoolingLayer,
+ CLSoftmaxLayer>;
+std::vector<unsigned int> compute_lenet5(unsigned int batches, std::string input_file)
+{
+ std::vector<std::string> weight_files = { "cnn_data/lenet_model/conv1_w.dat",
+ "cnn_data/lenet_model/conv2_w.dat",
+ "cnn_data/lenet_model/ip1_w.dat",
+ "cnn_data/lenet_model/ip2_w.dat"
+ };
+
+ std::vector<std::string> bias_files = { "cnn_data/lenet_model/conv1_b.dat",
+ "cnn_data/lenet_model/conv2_b.dat",
+ "cnn_data/lenet_model/ip1_b.dat",
+ "cnn_data/lenet_model/ip2_b.dat"
+ };
+ CLLeNet5Model network{};
+ network.build(batches);
+ network.fill(weight_files, bias_files);
+ network.feed(std::move(input_file));
+ network.run();
+
+ return network.get_classifications();
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(SYSTEM_TESTS)
+BOOST_AUTO_TEST_SUITE(CL)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_AUTO_TEST_CASE(LeNet5)
+{
+ // Compute LeNet5
+ std::vector<unsigned int> classified_labels = compute_lenet5(10, "cnn_data/mnist_data/input100.dat");
+
+ // Expected labels
+ std::vector<unsigned int> expected_labels = { 7, 2, 1, 0, 4, 1, 4, 9, 5, 9 };
+
+ // Validate labels
+ validate(classified_labels, expected_labels);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* DOXYGEN_SKIP_THIS */
+#endif /* INTERNAL_ONLY */
diff --git a/tests/validation/system_tests/NEON/AlexNet.cpp b/tests/validation/system_tests/NEON/AlexNet.cpp
new file mode 100644
index 0000000000..e56110d8de
--- /dev/null
+++ b/tests/validation/system_tests/NEON/AlexNet.cpp
@@ -0,0 +1,112 @@
+#ifdef INTERNAL_ONLY //FIXME Delete this file before the release
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/SubTensor.h"
+
+#include "model_objects/AlexNet.h"
+
+#include <array>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+using NEAlexNetModel = model_objects::AlexNet<ITensor,
+ Tensor,
+ SubTensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NENormalizationLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer>;
+std::vector<unsigned int> compute_alexnet(unsigned int batches, std::string input_file)
+{
+ std::vector<std::string> weight_files = { "cnn_data/alexnet_model/conv1_w.dat",
+ "cnn_data/alexnet_model/conv2_w.dat",
+ "cnn_data/alexnet_model/conv3_w.dat",
+ "cnn_data/alexnet_model/conv4_w.dat",
+ "cnn_data/alexnet_model/conv5_w.dat",
+ "cnn_data/alexnet_model/fc6_w.dat",
+ "cnn_data/alexnet_model/fc7_w.dat",
+ "cnn_data/alexnet_model/fc8_w.dat"
+ };
+
+ std::vector<std::string> bias_files = { "cnn_data/alexnet_model/conv1_b.dat",
+ "cnn_data/alexnet_model/conv2_b.dat",
+ "cnn_data/alexnet_model/conv3_b.dat",
+ "cnn_data/alexnet_model/conv4_b.dat",
+ "cnn_data/alexnet_model/conv5_b.dat",
+ "cnn_data/alexnet_model/fc6_b.dat",
+ "cnn_data/alexnet_model/fc7_b.dat",
+ "cnn_data/alexnet_model/fc8_b.dat"
+ };
+ NEAlexNetModel network{};
+
+ network.init_weights(batches);
+ network.build();
+ network.allocate();
+ network.fill(weight_files, bias_files);
+ network.feed(std::move(input_file));
+ network.run();
+
+ return network.get_classifications();
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(SYSTEM_TESTS)
+BOOST_AUTO_TEST_SUITE(NEON)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_AUTO_TEST_CASE(AlexNet)
+{
+ // Compute AlexNet
+ std::vector<unsigned int> classified_labels = compute_alexnet(1, "cnn_data/imagenet_data/shark.dat");
+
+ // Expected labels
+ std::vector<unsigned int> expected_labels = { 2 };
+
+ // Validate labels
+ validate(classified_labels, expected_labels);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* DOXYGEN_SKIP_THIS */
+#endif /* INTERNAL_ONLY */
diff --git a/tests/validation/system_tests/NEON/LeNet5.cpp b/tests/validation/system_tests/NEON/LeNet5.cpp
new file mode 100644
index 0000000000..a82b84a997
--- /dev/null
+++ b/tests/validation/system_tests/NEON/LeNet5.cpp
@@ -0,0 +1,94 @@
+#ifdef INTERNAL_ONLY //FIXME Delete this file before the release
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+
+#include "model_objects/LeNet5.h"
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+using NELeNet5Model = model_objects::LeNet5<Tensor,
+ NEAccessor,
+ NEActivationLayer,
+ NEConvolutionLayer,
+ NEFullyConnectedLayer,
+ NEPoolingLayer,
+ NESoftmaxLayer>;
+std::vector<unsigned int> compute_lenet5(unsigned int batches, std::string input_file)
+{
+ std::vector<std::string> weight_files = { "cnn_data/lenet_model/conv1_w.dat",
+ "cnn_data/lenet_model/conv2_w.dat",
+ "cnn_data/lenet_model/ip1_w.dat",
+ "cnn_data/lenet_model/ip2_w.dat"
+ };
+
+ std::vector<std::string> bias_files = { "cnn_data/lenet_model/conv1_b.dat",
+ "cnn_data/lenet_model/conv2_b.dat",
+ "cnn_data/lenet_model/ip1_b.dat",
+ "cnn_data/lenet_model/ip2_b.dat"
+ };
+ NELeNet5Model network{};
+ network.build(batches);
+ network.fill(weight_files, bias_files);
+ network.feed(std::move(input_file));
+ network.run();
+
+ return network.get_classifications();
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(SYSTEM_TESTS)
+BOOST_AUTO_TEST_SUITE(NEON)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_AUTO_TEST_CASE(LeNet5)
+{
+ // Compute LeNet5
+ std::vector<unsigned int> classified_labels = compute_lenet5(10, "cnn_data/mnist_data/input100.dat");
+
+ // Expected labels
+ std::vector<unsigned int> expected_labels = { 7, 2, 1, 0, 4, 1, 4, 9, 5, 9 };
+
+ // Validate labels
+ validate(classified_labels, expected_labels);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* DOXYGEN_SKIP_THIS */
+#endif /* INTERNAL_ONLY */
diff --git a/utils/Utils.cpp b/utils/Utils.cpp
new file mode 100644
index 0000000000..5316690a3d
--- /dev/null
+++ b/utils/Utils.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Utils.h"
+
+#include <array>
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+#include <iomanip>
+#include <limits>
+#include <string>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace utils
+{
+namespace
+{
+/* Advance the stream to the first character which is not part of a comment
+ *
+ * @param[in,out] fs Stream to drop comments from
+ */
+void discard_comments(std::ifstream &fs)
+{
+ while(fs.peek() == '#')
+ {
+ fs.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+ }
+}
+
+/* Advance the stream to the next character which is neither a space nor a comment
+ *
+ * @param[in,out] fs Stream to drop comments from
+ */
+void discard_comments_and_spaces(std::ifstream &fs)
+{
+ while(true)
+ {
+ discard_comments(fs);
+
+ if(isspace(fs.peek()) == 0)
+ {
+ break;
+ }
+
+ fs.ignore(1);
+ }
+}
+} // namespace
+
+int run_example(int argc, const char **argv, example &func)
+{
+ std::cout << "\n"
+ << argv[0] << "\n\n";
+
+ try
+ {
+ func(argc, argv);
+
+ std::cout << "\nTest passed\n";
+ return 0;
+ }
+#ifdef ARM_COMPUTE_CL
+ catch(cl::Error &err)
+ {
+ std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
+ std::cerr << std::endl
+ << "ERROR " << err.what() << "(" << err.err() << ")" << std::endl;
+ std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
+ }
+#endif /* ARM_COMPUTE_CL */
+ catch(std::runtime_error &err)
+ {
+ std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
+ std::cerr << std::endl
+ << "ERROR " << err.what() << " " << (errno ? strerror(errno) : "") << std::endl;
+ std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl;
+ }
+
+ std::cout << "\nTest FAILED\n";
+
+ return -1;
+}
+
+void draw_detection_rectangle(ITensor *tensor, const DetectionWindow &rect, uint8_t r, uint8_t g, uint8_t b)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(tensor, Format::RGB888);
+
+ uint8_t *top = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y)) + tensor->buffer();
+ uint8_t *bottom = tensor->info()->offset_element_in_bytes(Coordinates(rect.x, rect.y + rect.height)) + tensor->buffer();
+ uint8_t *left = top;
+ uint8_t *right = tensor->info()->offset_element_in_bytes(Coordinates(rect.x + rect.width, rect.y)) + tensor->buffer();
+ size_t stride = tensor->info()->strides_in_bytes()[Window::DimY];
+
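+ // Draw the top and bottom horizontal edges (RGB888: 3 bytes per pixel)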
+ for(size_t x = 0; x < rect.width; ++x)
+ {
+ top[0] = r;
+ top[1] = g;
+ top[2] = b;
+ bottom[0] = r;
+ bottom[1] = g;
+ bottom[2] = b;
+
+ top += 3;
+ bottom += 3;
+ }
+
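+ // Draw the left and right vertical edges, advancing one row (stride bytes) per pixel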
+ for(size_t y = 0; y < rect.height; ++y)
+ {
+ left[0] = r;
+ left[1] = g;
+ left[2] = b;
+ right[0] = r;
+ right[1] = g;
+ right[2] = b;
+
+ left += stride;
+ right += stride;
+ }
+}
+
+std::tuple<unsigned int, unsigned int, int> parse_ppm_header(std::ifstream &fs)
+{
+ // Check the PPM magic number is valid
+ std::array<char, 2> magic_number{ { 0 } };
+ fs >> magic_number[0] >> magic_number[1];
+ ARM_COMPUTE_ERROR_ON_MSG(magic_number[0] != 'P' || magic_number[1] != '6', "Invalid file type");
+ ARM_COMPUTE_UNUSED(magic_number);
+
+ discard_comments_and_spaces(fs);
+
+ unsigned int width = 0;
+ fs >> width;
+
+ discard_comments_and_spaces(fs);
+
+ unsigned int height = 0;
+ fs >> height;
+
+ discard_comments_and_spaces(fs);
+
+ int max_val = 0;
+ fs >> max_val;
+
+ discard_comments(fs);
+
+ ARM_COMPUTE_ERROR_ON_MSG(isspace(fs.peek()) == 0, "Invalid PPM header");
+ fs.ignore(1);
+
+ return std::make_tuple(width, height, max_val);
+}
+} // namespace utils
+} // namespace arm_compute
diff --git a/utils/Utils.h b/utils/Utils.h
new file mode 100644
index 0000000000..b519f83a83
--- /dev/null
+++ b/utils/Utils.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __UTILS_UTILS_H__
+#define __UTILS_UTILS_H__
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#ifdef ARM_COMPUTE_CL
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#endif /* ARM_COMPUTE_CL */
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace utils
+{
+/** Signature of an example to run
+ *
+ * @param[in] argc Number of command line arguments
+ * @param[in] argv Command line arguments
+ */
+using example = void(int argc, const char **argv);
+
+/** Run an example and handle the potential exceptions it throws
+ *
+ * @param[in] argc Number of command line arguments
+ * @param[in] argv Command line arguments
+ * @param[in] func Function containing the code to run
+ *
+ * @return 0 if the example ran successfully, -1 otherwise
+ */
+int run_example(int argc, const char **argv, example &func);
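+
+// A minimal usage sketch (illustrative only; "main_example" is a hypothetical
+// name, not part of this patch): an example defines a function matching the
+// `example` signature above and forwards to run_example from main:
+//
+//     void main_example(int argc, const char **argv)
+//     {
+//         // set up tensors, configure functions and run them...
+//     }
+//
+//     int main(int argc, const char **argv)
+//     {
+//         return arm_compute::utils::run_example(argc, argv, main_example);
+//     }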
+
+/** Draw an RGB rectangular window for the detected object
+ *
+ * @param[in, out] tensor Input tensor where the rectangle will be drawn on. Format supported: RGB888
+ * @param[in] rect Geometry of the rectangular window
+ * @param[in] r Red colour to use
+ * @param[in] g Green colour to use
+ * @param[in] b Blue colour to use
+ */
+void draw_detection_rectangle(arm_compute::ITensor *tensor, const arm_compute::DetectionWindow &rect, uint8_t r, uint8_t g, uint8_t b);
+
+/** Parse the PPM header from an input file stream. At the end of the execution,
+ * the file position pointer will be located at the first pixel stored in the PPM file
+ *
+ * @param[in] fs Input file stream to parse
+ *
+ * @return The width, height and max value stored in the header of the PPM file
+ */
+std::tuple<unsigned int, unsigned int, int> parse_ppm_header(std::ifstream &fs);
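+
+// For reference, a PPM ("P6") header accepted by this parser looks like:
+//
+//     P6
+//     # comment lines starting with '#' may appear between tokens
+//     640 480
+//     255
+//
+// i.e. the magic number, the width, the height and the maximum channel value,
+// separated by whitespace and/or comments, followed by a single whitespace
+// character and then the binary RGB pixel data.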
+
+/** Class to load the content of a PPM file into an Image
+ */
+class PPMLoader
+{
+public:
+ PPMLoader()
+ : _fs(), _width(0), _height(0)
+ {
+ }
+ /** Open a PPM file and read its metadata (width and height)
+ *
+ * @param[in] ppm_filename File to open
+ */
+ void open(const std::string &ppm_filename)
+ {
+ ARM_COMPUTE_ERROR_ON(is_open());
+ try
+ {
+ _fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+ _fs.open(ppm_filename, std::ios::in | std::ios::binary);
+
+ unsigned int max_val = 0;
+ std::tie(_width, _height, max_val) = parse_ppm_header(_fs);
+
+ ARM_COMPUTE_ERROR_ON_MSG(max_val >= 256, "2 bytes per colour channel not supported in file %s", ppm_filename.c_str());
+ }
+ catch(const std::ifstream::failure &e)
+ {
+ ARM_COMPUTE_ERROR("Accessing %s: %s", ppm_filename.c_str(), e.what());
+ }
+ }
+ /** Return true if a PPM file is currently open
+ */
+ bool is_open()
+ {
+ return _fs.is_open();
+ }
+
+ /** Initialise an image's metadata with the dimensions of the PPM file currently open
+ *
+ * @param[out] image Image to initialise
+ * @param[in] format Format to use for the image (Must be RGB888 or U8)
+ */
+ template <typename T>
+ void init_image(T &image, arm_compute::Format format)
+ {
+ ARM_COMPUTE_ERROR_ON(!is_open());
+ ARM_COMPUTE_ERROR_ON(format != arm_compute::Format::RGB888 && format != arm_compute::Format::U8);
+
+ // Use the size of the input PPM image
+ arm_compute::TensorInfo image_info(_width, _height, format);
+ image.allocator()->init(image_info);
+ }
+
+ /** Fill an image with the content of the currently open PPM file.
+ *
+ * @note If the image is a CLImage, the function maps and unmaps the image
+ *
+ * @param[in,out] image Image to fill (Must be allocated, and of matching dimensions with the opened PPM).
+ */
+ template <typename T>
+ void fill_image(T &image)
+ {
+ ARM_COMPUTE_ERROR_ON(!is_open());
+ ARM_COMPUTE_ERROR_ON(image.info()->dimension(0) != _width || image.info()->dimension(1) != _height);
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(&image, arm_compute::Format::U8, arm_compute::Format::RGB888);
+ try
+ {
+#ifdef ARM_COMPUTE_CL
+ // Map buffer if creating a CLTensor
+ if(std::is_same<typename std::decay<T>::type, arm_compute::CLTensor>::value)
+ {
+ image.map();
+ }
+#endif /* ARM_COMPUTE_CL */
+ // Check if the file is large enough to fill the image
+ const size_t current_position = _fs.tellg();
+ _fs.seekg(0, std::ios_base::end);
+ const size_t end_position = _fs.tellg();
+ _fs.seekg(current_position, std::ios_base::beg);
+
+ ARM_COMPUTE_ERROR_ON_MSG((end_position - current_position) < image.info()->tensor_shape().total_size() * image.info()->element_size(),
+ "Not enough data in file");
+ ARM_COMPUTE_UNUSED(end_position);
+
+ switch(image.info()->format())
+ {
+ case arm_compute::Format::U8:
+ {
+ // We need to convert the data from RGB to grayscale:
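+ // (Rec. 709 luma weights: Y = 0.2126 R + 0.7152 G + 0.0722 B)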
+ // Iterate through every pixel of the image
+ arm_compute::Window window;
+ window.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, _width, 1));
+ window.set(arm_compute::Window::DimY, arm_compute::Window::Dimension(0, _height, 1));
+
+ arm_compute::Iterator out(&image, window);
+
+ unsigned char red = 0;
+ unsigned char green = 0;
+ unsigned char blue = 0;
+
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ red = _fs.get();
+ green = _fs.get();
+ blue = _fs.get();
+
+ *out.ptr() = 0.2126f * red + 0.7152f * green + 0.0722f * blue;
+ },
+ out);
+
+ break;
+ }
+ case arm_compute::Format::RGB888:
+ {
+ // There is no format conversion needed: we can simply copy the content of the input file to the image one row at a time.
+ // Create a vertical window to iterate through the image's rows:
+ arm_compute::Window window;
+ window.set(arm_compute::Window::DimY, arm_compute::Window::Dimension(0, _height, 1));
+
+ arm_compute::Iterator out(&image, window);
+
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ // Copy one row from the input file to the current row of the image:
+ _fs.read(reinterpret_cast<std::fstream::char_type *>(out.ptr()), _width * image.info()->element_size());
+ },
+ out);
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+
+#ifdef ARM_COMPUTE_CL
+ // Unmap buffer if creating a CLTensor
+ if(std::is_same<typename std::decay<T>::type, arm_compute::CLTensor>::value)
+ {
+ image.unmap();
+ }
+#endif /* ARM_COMPUTE_CL */
+ }
+ catch(const std::ifstream::failure &e)
+ {
+ ARM_COMPUTE_ERROR("Loading PPM file: %s", e.what());
+ }
+ }
+
+private:
+ std::ifstream _fs;
+ unsigned int _width, _height;
+};
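+
+// A minimal usage sketch for PPMLoader (illustrative only; "input.ppm" is a
+// hypothetical file, not part of this patch):
+//
+//     arm_compute::utils::PPMLoader ppm;
+//     arm_compute::Image            src;
+//
+//     ppm.open("input.ppm");
+//     ppm.init_image(src, arm_compute::Format::RGB888);
+//     src.allocator()->allocate();
+//     ppm.fill_image(src);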
+
+/** Template helper function to save a tensor image to a PPM file.
+ *
+ * @note Only U8 and RGB888 formats are supported.
+ * @note Only works with 2D tensors.
+ * @note If the input tensor is a CLTensor, the function maps and unmaps the tensor
+ *
+ * @param[in] tensor The tensor to save as PPM file
+ * @param[in] ppm_filename Filename of the file to create.
+ */
+template <typename T>
+void save_to_ppm(T &tensor, const std::string &ppm_filename)
+{
+ ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(&tensor, arm_compute::Format::RGB888, arm_compute::Format::U8);
+ ARM_COMPUTE_ERROR_ON(tensor.info()->num_dimensions() > 2);
+
+ std::ofstream fs;
+
+ try
+ {
+ fs.exceptions(std::ofstream::failbit | std::ofstream::badbit | std::ofstream::eofbit);
+ fs.open(ppm_filename, std::ios::out | std::ios::binary);
+
+ const unsigned int width = tensor.info()->tensor_shape()[0];
+ const unsigned int height = tensor.info()->tensor_shape()[1];
+
+ fs << "P6\n"
+ << width << " " << height << " 255\n";
+
+#ifdef ARM_COMPUTE_CL
+ // Map buffer if creating a CLTensor
+ if(std::is_same<typename std::decay<T>::type, arm_compute::CLTensor>::value)
+ {
+ tensor.map();
+ }
+#endif /* ARM_COMPUTE_CL */
+
+ switch(tensor.info()->format())
+ {
+ case arm_compute::Format::U8:
+ {
+ arm_compute::Window window;
+ window.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, width, 1));
+ window.set(arm_compute::Window::DimY, arm_compute::Window::Dimension(0, height, 1));
+
+ arm_compute::Iterator in(&tensor, window);
+
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ const unsigned char value = *in.ptr();
+
+ fs << value << value << value;
+ },
+ in);
+
+ break;
+ }
+ case arm_compute::Format::RGB888:
+ {
+ arm_compute::Window window;
+ window.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, width, width));
+ window.set(arm_compute::Window::DimY, arm_compute::Window::Dimension(0, height, 1));
+
+ arm_compute::Iterator in(&tensor, window);
+
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ fs.write(reinterpret_cast<std::fstream::char_type *>(in.ptr()), width * tensor.info()->element_size());
+ },
+ in);
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported format");
+ }
+#ifdef ARM_COMPUTE_CL
+ // Unmap buffer if creating a CLTensor
+ if(std::is_same<typename std::decay<T>::type, arm_compute::CLTensor>::value)
+ {
+ tensor.unmap();
+ }
+#endif /* ARM_COMPUTE_CL */
+ }
+ catch(const std::ofstream::failure &e)
+ {
+ ARM_COMPUTE_ERROR("Writing %s: (%s)", ppm_filename.c_str(), e.what());
+ }
+}
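+
+// Typical call (illustrative): given a 2D U8 or RGB888 tensor "dst" produced by
+// a pipeline, the result can be written out with:
+//
+//     arm_compute::utils::save_to_ppm(dst, "output.ppm");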
+} // namespace utils
+} // namespace arm_compute
+#endif /* __UTILS_UTILS_H__*/