path: root/src/backends/cl
Diffstat (limited to 'src/backends/cl')
-rw-r--r--  src/backends/cl/CMakeLists.txt | 22
-rw-r--r--  src/backends/cl/ClContextControl.cpp | 235
-rw-r--r--  src/backends/cl/ClContextControl.hpp | 62
-rw-r--r--  src/backends/cl/ClLayerSupport.cpp | 478
-rw-r--r--  src/backends/cl/ClLayerSupport.hpp | 164
-rw-r--r--  src/backends/cl/ClTensorHandle.hpp | 141
-rw-r--r--  src/backends/cl/ClWorkloadFactory.cpp | 506
-rw-r--r--  src/backends/cl/ClWorkloadFactory.hpp | 139
-rw-r--r--  src/backends/cl/backend.cmake | 13
-rw-r--r--  src/backends/cl/backend.mk | 51
-rw-r--r--  src/backends/cl/test/CMakeLists.txt | 0
-rw-r--r--  src/backends/cl/workloads/CMakeLists.txt | 92
-rw-r--r--  src/backends/cl/workloads/ClActivationFloatWorkload.cpp | 56
-rw-r--r--  src/backends/cl/workloads/ClActivationFloatWorkload.hpp | 29
-rw-r--r--  src/backends/cl/workloads/ClActivationUint8Workload.cpp | 44
-rw-r--r--  src/backends/cl/workloads/ClActivationUint8Workload.hpp | 29
-rw-r--r--  src/backends/cl/workloads/ClAdditionWorkload.cpp | 66
-rw-r--r--  src/backends/cl/workloads/ClAdditionWorkload.hpp | 31
-rw-r--r--  src/backends/cl/workloads/ClBaseConstantWorkload.cpp | 64
-rw-r--r--  src/backends/cl/workloads/ClBaseConstantWorkload.hpp | 30
-rw-r--r--  src/backends/cl/workloads/ClBaseMergerWorkload.hpp | 28
-rw-r--r--  src/backends/cl/workloads/ClBaseSplitterWorkload.hpp | 28
-rw-r--r--  src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp | 96
-rw-r--r--  src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp | 46
-rw-r--r--  src/backends/cl/workloads/ClConstantFloatWorkload.cpp | 18
-rw-r--r--  src/backends/cl/workloads/ClConstantFloatWorkload.hpp | 20
-rw-r--r--  src/backends/cl/workloads/ClConstantUint8Workload.cpp | 18
-rw-r--r--  src/backends/cl/workloads/ClConstantUint8Workload.hpp | 20
-rw-r--r--  src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp | 66
-rw-r--r--  src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp | 30
-rw-r--r--  src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp | 66
-rw-r--r--  src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp | 30
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dBaseWorkload.cpp | 48
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dBaseWorkload.hpp | 24
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dFloatWorkload.cpp | 81
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dFloatWorkload.hpp | 35
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dUint8Workload.cpp | 81
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dUint8Workload.hpp | 35
-rw-r--r--  src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.cpp | 125
-rw-r--r--  src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.hpp | 40
-rw-r--r--  src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.cpp | 39
-rw-r--r--  src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.hpp | 26
-rw-r--r--  src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.cpp | 40
-rw-r--r--  src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.hpp | 23
-rw-r--r--  src/backends/cl/workloads/ClDivisionFloatWorkload.cpp | 48
-rw-r--r--  src/backends/cl/workloads/ClDivisionFloatWorkload.hpp | 32
-rw-r--r--  src/backends/cl/workloads/ClFloorFloatWorkload.cpp | 31
-rw-r--r--  src/backends/cl/workloads/ClFloorFloatWorkload.hpp | 30
-rw-r--r--  src/backends/cl/workloads/ClFullyConnectedWorkload.cpp | 96
-rw-r--r--  src/backends/cl/workloads/ClFullyConnectedWorkload.hpp | 43
-rw-r--r--  src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp | 50
-rw-r--r--  src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp | 35
-rw-r--r--  src/backends/cl/workloads/ClLstmFloatWorkload.cpp | 391
-rw-r--r--  src/backends/cl/workloads/ClLstmFloatWorkload.hpp | 68
-rw-r--r--  src/backends/cl/workloads/ClMergerFloatWorkload.cpp | 20
-rw-r--r--  src/backends/cl/workloads/ClMergerFloatWorkload.hpp | 22
-rw-r--r--  src/backends/cl/workloads/ClMergerUint8Workload.cpp | 19
-rw-r--r--  src/backends/cl/workloads/ClMergerUint8Workload.hpp | 21
-rw-r--r--  src/backends/cl/workloads/ClMultiplicationFloatWorkload.cpp | 60
-rw-r--r--  src/backends/cl/workloads/ClMultiplicationFloatWorkload.hpp | 34
-rw-r--r--  src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp | 51
-rw-r--r--  src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp | 29
-rw-r--r--  src/backends/cl/workloads/ClPadWorkload.cpp | 63
-rw-r--r--  src/backends/cl/workloads/ClPadWorkload.hpp | 32
-rw-r--r--  src/backends/cl/workloads/ClPermuteWorkload.cpp | 56
-rw-r--r--  src/backends/cl/workloads/ClPermuteWorkload.hpp | 42
-rw-r--r--  src/backends/cl/workloads/ClPooling2dBaseWorkload.cpp | 47
-rw-r--r--  src/backends/cl/workloads/ClPooling2dBaseWorkload.hpp | 33
-rw-r--r--  src/backends/cl/workloads/ClPooling2dFloatWorkload.cpp | 26
-rw-r--r--  src/backends/cl/workloads/ClPooling2dFloatWorkload.hpp | 22
-rw-r--r--  src/backends/cl/workloads/ClPooling2dUint8Workload.cpp | 27
-rw-r--r--  src/backends/cl/workloads/ClPooling2dUint8Workload.hpp | 25
-rw-r--r--  src/backends/cl/workloads/ClReshapeFloatWorkload.cpp | 33
-rw-r--r--  src/backends/cl/workloads/ClReshapeFloatWorkload.hpp | 28
-rw-r--r--  src/backends/cl/workloads/ClReshapeUint8Workload.cpp | 31
-rw-r--r--  src/backends/cl/workloads/ClReshapeUint8Workload.hpp | 29
-rw-r--r--  src/backends/cl/workloads/ClResizeBilinearFloatWorkload.cpp | 38
-rw-r--r--  src/backends/cl/workloads/ClResizeBilinearFloatWorkload.hpp | 25
-rw-r--r--  src/backends/cl/workloads/ClSoftmaxBaseWorkload.cpp | 30
-rw-r--r--  src/backends/cl/workloads/ClSoftmaxBaseWorkload.hpp | 17
-rw-r--r--  src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp | 33
-rw-r--r--  src/backends/cl/workloads/ClSoftmaxFloatWorkload.hpp | 30
-rw-r--r--  src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp | 43
-rw-r--r--  src/backends/cl/workloads/ClSoftmaxUint8Workload.hpp | 31
-rw-r--r--  src/backends/cl/workloads/ClSplitterFloatWorkload.cpp | 19
-rw-r--r--  src/backends/cl/workloads/ClSplitterFloatWorkload.hpp | 20
-rw-r--r--  src/backends/cl/workloads/ClSplitterUint8Workload.cpp | 19
-rw-r--r--  src/backends/cl/workloads/ClSplitterUint8Workload.hpp | 21
-rw-r--r--  src/backends/cl/workloads/ClSubtractionWorkload.cpp | 66
-rw-r--r--  src/backends/cl/workloads/ClSubtractionWorkload.hpp | 31
-rw-r--r--  src/backends/cl/workloads/ClWorkloadUtils.hpp | 63
-rw-r--r--  src/backends/cl/workloads/ClWorkloads.hpp | 41
92 files changed, 5416 insertions(+), 0 deletions(-)
diff --git a/src/backends/cl/CMakeLists.txt b/src/backends/cl/CMakeLists.txt
new file mode 100644
index 0000000000..80ca0acc08
--- /dev/null
+++ b/src/backends/cl/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnClBackend_sources
+ ClContextControl.cpp
+ ClContextControl.hpp
+ ClLayerSupport.cpp
+ ClLayerSupport.hpp
+ ClWorkloadFactory.cpp
+ ClWorkloadFactory.hpp
+)
+
+if(ARMCOMPUTECL)
+ add_subdirectory(workloads test)
+endif()
+
+add_library(armnnClBackend STATIC ${armnnClBackend_sources})
+target_include_directories(armnnClBackend PRIVATE ${PROJECT_SOURCE_DIR}/src)
+target_include_directories(armnnClBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnClBackend PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
diff --git a/src/backends/cl/ClContextControl.cpp b/src/backends/cl/ClContextControl.cpp
new file mode 100644
index 0000000000..e8b21c942d
--- /dev/null
+++ b/src/backends/cl/ClContextControl.cpp
@@ -0,0 +1,235 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClContextControl.hpp"
+
+#include "armnn/Exceptions.hpp"
+
+#ifdef ARMCOMPUTECL_ENABLED
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#endif
+
+#include <boost/assert.hpp>
+#include <boost/format.hpp>
+#include <boost/log/trivial.hpp>
+#include <boost/polymorphic_cast.hpp>
+#include <boost/core/ignore_unused.hpp>
+
+#include "LeakChecking.hpp"
+
+namespace cl
+{
+class Context;
+class CommandQueue;
+class Device;
+}
+
+namespace armnn
+{
+
+ClContextControl::ClContextControl(IGpuAccTunedParameters* clTunedParameters,
+ bool profilingEnabled)
+ : m_clTunedParameters(boost::polymorphic_downcast<ClTunedParameters*>(clTunedParameters))
+ , m_ProfilingEnabled(profilingEnabled)
+{
+ // Mark m_ProfilingEnabled as used to avoid compilation problems when ArmCompute is disabled.
+ boost::ignore_unused(m_ProfilingEnabled);
+
+#ifdef ARMCOMPUTECL_ENABLED
+ try
+ {
+ std::vector<cl::Platform> platforms;
+ cl::Platform::get(&platforms);
+
+ // Select the first platform found as the default.
+ cl::Platform::setDefault(platforms[0]);
+
+ std::vector<cl::Device> devices;
+ platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
+
+ // Select the first device found as the default.
+ cl::Device::setDefault(devices[0]);
+ }
+ catch (const cl::Error& clError)
+ {
+ throw ClRuntimeUnavailableException(boost::str(boost::format(
+ "Could not initialize the CL runtime. Error description: %1%. CL error code: %2%"
+ ) % clError.what() % clError.err()));
+ }
+
+ // Removes the use of global CL context.
+ cl::Context::setDefault(cl::Context{});
+ BOOST_ASSERT(cl::Context::getDefault()() == NULL);
+
+ // Removes the use of global CL command queue.
+ cl::CommandQueue::setDefault(cl::CommandQueue{});
+ BOOST_ASSERT(cl::CommandQueue::getDefault()() == NULL);
+
+ // Always load the OpenCL runtime.
+ LoadOpenClRuntime();
+#endif
+}
+
+ClContextControl::~ClContextControl()
+{
+#ifdef ARMCOMPUTECL_ENABLED
+ // Load the OpenCL runtime without the tuned parameters to free the memory for them.
+ try
+ {
+ UnloadOpenClRuntime();
+ }
+ catch (const cl::Error& clError)
+ {
+ // This should not happen, it is ignored if it does.
+
+ // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an
+ // exception of type std::length_error.
+ // Using stderr instead in this context as there is no point in nesting try-catch blocks here.
+ std::cerr << "A CL error occurred unloading the runtime tuner parameters: "
+ << clError.what() << ". CL error code is: " << clError.err() << std::endl;
+ }
+#endif
+}
+
+void ClContextControl::LoadOpenClRuntime()
+{
+ DoLoadOpenClRuntime(true);
+}
+
+void ClContextControl::UnloadOpenClRuntime()
+{
+ DoLoadOpenClRuntime(false);
+}
+
+void ClContextControl::DoLoadOpenClRuntime(bool useTunedParameters)
+{
+#ifdef ARMCOMPUTECL_ENABLED
+ cl::Device device = cl::Device::getDefault();
+ cl::Context context;
+ cl::CommandQueue commandQueue;
+
+ if (arm_compute::CLScheduler::get().context()() != NULL)
+ {
+ // Wait for all queued CL requests to finish before reinitialising it.
+ arm_compute::CLScheduler::get().sync();
+ }
+
+ try
+ {
+ arm_compute::CLKernelLibrary::get().clear_programs_cache();
+ // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no
+ // context references); it is initialised again, with a proper context, later.
+ arm_compute::CLScheduler::get().init(context, commandQueue, device);
+ arm_compute::CLKernelLibrary::get().init(".", context, device);
+
+ {
+ //
+ // Here we replace the context with a new one. The memory leak checks
+ // see the new context as an extra allocation but, because of their
+ // scope, they do not see the disposal of the original object. The
+ // creation of the new context would therefore be flagged as a leak;
+ // disabling leak checking in this scope prevents that.
+ //
+ ARMNN_DISABLE_LEAK_CHECKING_IN_SCOPE();
+ context = cl::Context(device);
+ }
+
+ // NOTE: In this specific case profiling has to be enabled on the command queue
+ // in order for the CLTuner to work.
+ bool profilingNeededForClTuner = useTunedParameters && m_clTunedParameters &&
+ m_clTunedParameters->m_Mode == IGpuAccTunedParameters::Mode::UpdateTunedParameters;
+
+ if (m_ProfilingEnabled || profilingNeededForClTuner)
+ {
+ // Create a new queue with profiling enabled.
+ commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);
+ }
+ else
+ {
+ // Use default queue.
+ commandQueue = cl::CommandQueue(context, device);
+ }
+ }
+ catch (const cl::Error& clError)
+ {
+ throw ClRuntimeUnavailableException(boost::str(boost::format(
+ "Could not initialize the CL runtime. Error description: %1%. CL error code: %2%"
+ ) % clError.what() % clError.err()));
+ }
+
+ // Note: the first argument (the path to the CL source code) is ignored, as the kernels are embedded in ARM Compute.
+ arm_compute::CLKernelLibrary::get().init(".", context, device);
+
+ arm_compute::ICLTuner* tuner = nullptr;
+ if (useTunedParameters && m_clTunedParameters)
+ {
+ tuner = &m_clTunedParameters->m_Tuner;
+ }
+ arm_compute::CLScheduler::get().init(context, commandQueue, device, tuner);
+#endif
+}
+
+void ClContextControl::ClearClCache()
+{
+ DoLoadOpenClRuntime(true);
+}
+
+armnn::IGpuAccTunedParameters* IGpuAccTunedParameters::CreateRaw(armnn::IGpuAccTunedParameters::Mode mode)
+{
+ return new ClTunedParameters(mode);
+}
+
+armnn::IGpuAccTunedParametersPtr IGpuAccTunedParameters::Create(armnn::IGpuAccTunedParameters::Mode mode)
+{
+ return IGpuAccTunedParametersPtr(CreateRaw(mode), &IGpuAccTunedParameters::Destroy);
+}
+
+void IGpuAccTunedParameters::Destroy(IGpuAccTunedParameters* params)
+{
+ delete params;
+}
+
+ClTunedParameters::ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode)
+ : m_Mode(mode)
+#ifdef ARMCOMPUTECL_ENABLED
+ , m_Tuner(mode == ClTunedParameters::Mode::UpdateTunedParameters)
+#endif
+{
+}
+
+void ClTunedParameters::Load(const char* filename)
+{
+#ifdef ARMCOMPUTECL_ENABLED
+ try
+ {
+ m_Tuner.load_from_file(filename);
+ }
+ catch (const std::exception& e)
+ {
+ throw armnn::Exception(std::string("Failed to load tuned parameters file '") + filename + "': " +
+ e.what());
+ }
+#endif
+}
+
+void ClTunedParameters::Save(const char* filename) const
+{
+#ifdef ARMCOMPUTECL_ENABLED
+ try
+ {
+ m_Tuner.save_to_file(filename);
+ }
+ catch (const std::exception& e)
+ {
+ throw armnn::Exception(std::string("Failed to save tuned parameters file to '") + filename + "': " +
+ e.what());
+ }
+#endif
+}
+
+} // namespace armnn
diff --git a/src/backends/cl/ClContextControl.hpp b/src/backends/cl/ClContextControl.hpp
new file mode 100644
index 0000000000..5ac56423bd
--- /dev/null
+++ b/src/backends/cl/ClContextControl.hpp
@@ -0,0 +1,62 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "armnn/IRuntime.hpp"
+
+#ifdef ARMCOMPUTECL_ENABLED
+#include <arm_compute/runtime/CL/CLTuner.h>
+#endif
+
+namespace armnn
+{
+
+class IGpuAccTunedParameters;
+class ClTunedParameters;
+
+// ARM Compute OpenCL context control.
+class ClContextControl
+{
+public:
+
+ ClContextControl(IGpuAccTunedParameters* clTunedParameters = nullptr,
+ bool profilingEnabled = false);
+
+ virtual ~ClContextControl();
+
+ void LoadOpenClRuntime();
+
+ // Users should call this (after freeing all of the cl::Context objects they use)
+ // to release the cached memory used by the compute library.
+ void UnloadOpenClRuntime();
+
+ // Clear the CL cache, without losing the tuned parameter settings.
+ void ClearClCache();
+
+private:
+
+ void DoLoadOpenClRuntime(bool useTunedParameters);
+
+ ClTunedParameters* m_clTunedParameters;
+
+ bool m_ProfilingEnabled;
+};
+
+class ClTunedParameters : public IGpuAccTunedParameters
+{
+public:
+ ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode);
+
+ virtual void Load(const char* filename);
+ virtual void Save(const char* filename) const;
+
+ Mode m_Mode;
+
+#ifdef ARMCOMPUTECL_ENABLED
+ arm_compute::CLTuner m_Tuner;
+#endif
+};
+
+} // namespace armnn
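
A minimal usage sketch, assuming ARMCOMPUTECL_ENABLED and only the declarations above; Load/Save are taken to be exposed on the IGpuAccTunedParameters interface from IRuntime.hpp, and the file name is purely illustrative. Creating the tuned parameters in UpdateTunedParameters mode makes DoLoadOpenClRuntime enable profiling on the command queue so the CLTuner can measure kernels, and the results can then be saved for later runs:

    #include "ClContextControl.hpp"
    #include <armnn/IRuntime.hpp>

    void TuneOnce()
    {
        using namespace armnn;

        // Update mode: the CLTuner explores kernel configurations while networks run.
        IGpuAccTunedParametersPtr tunedParams =
            IGpuAccTunedParameters::Create(IGpuAccTunedParameters::Mode::UpdateTunedParameters);

        // Constructing the control (re)initialises the CL scheduler and kernel library;
        // profiling is enabled on the command queue because the tuner is in update mode.
        ClContextControl contextControl(tunedParams.get(), /*profilingEnabled=*/false);

        // ... load and run networks on GpuAcc here ...

        // Persist the tuned parameters so later runs can use Mode::UseTunedParameters.
        tunedParams->Save("cl_tuned_params.bin"); // illustrative file name
    }
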
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
new file mode 100644
index 0000000000..6c1940b02f
--- /dev/null
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -0,0 +1,478 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "LayerSupportCommon.hpp"
+
+#include "ClLayerSupport.hpp"
+#include "InternalTypes.hpp"
+#include <armnn/Descriptors.hpp>
+#include <armnn/Types.hpp>
+#include <armnn/Tensor.hpp>
+
+#include <boost/core/ignore_unused.hpp>
+
+#ifdef ARMCOMPUTECL_ENABLED
+#include "workloads/ClAdditionWorkload.hpp"
+#include "workloads/ClActivationFloatWorkload.hpp"
+#include "workloads/ClBatchNormalizationFloatWorkload.hpp"
+#include "workloads/ClConvertFp16ToFp32Workload.hpp"
+#include "workloads/ClConvertFp32ToFp16Workload.hpp"
+#include "workloads/ClConvolution2dBaseWorkload.hpp"
+#include "workloads/ClDepthwiseConvolutionBaseWorkload.hpp"
+#include "workloads/ClDivisionFloatWorkload.hpp"
+#include "workloads/ClL2NormalizationFloatWorkload.hpp"
+#include "workloads/ClMultiplicationFloatWorkload.hpp"
+#include "workloads/ClFullyConnectedWorkload.hpp"
+#include "workloads/ClPadWorkload.hpp"
+#include "workloads/ClPooling2dBaseWorkload.hpp"
+#include "workloads/ClPermuteWorkload.hpp"
+#include "workloads/ClNormalizationFloatWorkload.hpp"
+#include "workloads/ClSoftmaxBaseWorkload.hpp"
+#include "workloads/ClSubtractionWorkload.hpp"
+#include "workloads/ClLstmFloatWorkload.hpp"
+#endif
+
+using namespace boost;
+
+namespace armnn
+{
+namespace
+{
+template<unsigned int FilterSize>
+bool IsMatchingSize2d(const TensorInfo& weightInfo)
+{
+ // Width & Height must match.
+ return (weightInfo.GetShape()[3] == FilterSize) && (weightInfo.GetShape()[2] == FilterSize);
+}
+
+template<uint32_t ValidStride>
+bool IsMatchingStride(uint32_t actualStride)
+{
+ return ValidStride == actualStride;
+}
+
+template<uint32_t FirstStride, uint32_t SecondStride, uint32_t... ValidStrides>
+bool IsMatchingStride(uint32_t actualStride)
+{
+ return IsMatchingStride<FirstStride>(actualStride) || IsMatchingStride<SecondStride, ValidStrides...>(actualStride);
+}
+
+bool IsClBackendSupported(std::string* reasonIfUnsupported)
+{
+#if ARMCOMPUTECL_ENABLED
+ return true;
+#else
+ if (reasonIfUnsupported != nullptr)
+ {
+ *reasonIfUnsupported = "The armnn library has been built without CL support";
+ }
+ return false;
+#endif
+}
+
+#if ARMCOMPUTECL_ENABLED
+#define FORWARD_CL_LAYER_SUPPORT_FUNC(expr) (expr)
+#else
+#define FORWARD_CL_LAYER_SUPPORT_FUNC(expr) IsClBackendSupported(reasonIfUnsupported)
+#endif
+
+#if ARMCOMPUTECL_ENABLED
+template<class FuncType, class... Args>
+inline bool IsWorkloadSupported(FuncType&& func, std::string* reasonIfUnsupported, Args&&... args)
+{
+ arm_compute::Status aclStatus = func(std::forward<Args>(args)...);
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = aclStatus.error_description();
+ }
+ return supported;
+}
+
+#define FORWARD_WORKLOAD_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \
+ return IsWorkloadSupported(func, reasonIfUnsupported, __VA_ARGS__);
+#else
+#define FORWARD_WORKLOAD_VALIDATE_FUNC(func, reasonIfUnsupported, ...) \
+ return IsClBackendSupported(reasonIfUnsupported);
+#endif
+
+} //namespace
+
+template<typename FloatFunc, typename Uint8Func, typename ... Params>
+bool IsSupportedForDataTypeCl(std::string* reasonIfUnsupported,
+ DataType dataType,
+ FloatFunc floatFuncPtr,
+ Uint8Func uint8FuncPtr,
+ Params&&... params)
+{
+ return IsClBackendSupported(reasonIfUnsupported) &&
+ IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+ dataType,
+ floatFuncPtr,
+ floatFuncPtr,
+ uint8FuncPtr,
+ std::forward<Params>(params)...);
+}
+
+bool IsActivationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClActivationWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor);
+}
+
+bool IsAdditionSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ return FORWARD_CL_LAYER_SUPPORT_FUNC(ClAdditionValidate(input0,
+ input1,
+ output,
+ reasonIfUnsupported));
+}
+
+bool IsBatchNormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClBatchNormalizationValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ mean,
+ var,
+ beta,
+ gamma,
+ descriptor);
+}
+
+bool IsConstantSupportedCl(const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeCl(reasonIfUnsupported,
+ output.GetDataType(),
+ &TrueFunc<>,
+ &FalseFuncU8<>);
+}
+
+bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc)
+{
+ bool isSupported = false;
+
+ bool strideXIsOneOrTwo = IsMatchingStride<1, 2>(desc.m_StrideX);
+ bool strideXIsThree = IsMatchingStride<3>(desc.m_StrideX);
+
+ bool strideYIsOneOrTwo = IsMatchingStride<1, 2>(desc.m_StrideY);
+ bool strideYIsThree = IsMatchingStride<3>(desc.m_StrideY);
+
+ bool strideIsOneOrTwo = strideXIsOneOrTwo && strideYIsOneOrTwo;
+ bool strideIsOneOrTwoOrThree = ( strideXIsOneOrTwo || strideXIsThree ) && ( strideYIsOneOrTwo || strideYIsThree );
+
+ // 1x1 convolution with strides of 1,2,3.
+ isSupported |= IsMatchingSize2d<1>(weightInfo) && ( strideIsOneOrTwoOrThree );
+
+ // 3x3 convolution with strides of 1,2.
+ isSupported |= IsMatchingSize2d<3>(weightInfo) && ( strideIsOneOrTwo );
+
+ // 5x5 convolution with strides of 1,2
+ isSupported |= IsMatchingSize2d<5>(weightInfo) && ( strideIsOneOrTwo );
+
+ //Fall back to normal convolution for the asymmetric padding case.
+ if (desc.m_PadLeft != desc.m_PadRight ||
+ desc.m_PadTop != desc.m_PadBottom)
+ {
+ //Direct convolution does not support asymmetric padding yet.
+ isSupported = false;
+ }
+
+ return isSupported;
+}
+
+bool IsDirectConvolution2dParamsSupportedCl(std::string* reasonIfUnsupported,
+ const Convolution2dDescriptor& parameters,
+ const TensorInfo& weightInfo)
+{
+ return IsClDirectConvolution2dSupported(weightInfo, parameters);
+}
+
+bool IsConvolution2dSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvolution2dWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor,
+ weights,
+ biases);
+}
+
+bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClDepthwiseConvolutionWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ descriptor,
+ weights,
+ biases);
+}
+
+bool IsDivisionSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClDivisionWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
+}
+
+bool IsSubtractionSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ return FORWARD_CL_LAYER_SUPPORT_FUNC(ClSubtractionValidate(input0,
+ input1,
+ output,
+ reasonIfUnsupported));
+}
+
+bool IsFullyConnectedSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClFullyConnectedWorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ weights,
+ biases,
+ descriptor);
+}
+
+bool IsInputSupportedCl(const TensorInfo& input,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeCl(reasonIfUnsupported,
+ input.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsL2NormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const L2NormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
+bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs,
+ const OriginsDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(descriptor);
+ return IsSupportedForDataTypeCl(reasonIfUnsupported,
+ inputs[0]->GetDataType(),
+ &TrueFunc<>,
+ &FalseFuncU8<>);
+}
+
+bool IsMultiplicationSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClMultiplicationWorkloadValidate,
+ reasonIfUnsupported,
+ input0,
+ input1,
+ output);
+}
+
+bool IsNormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClNormalizationWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
+bool IsOutputSupportedCl(const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeCl(reasonIfUnsupported,
+ output.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsPadSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const PadDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ return FORWARD_CL_LAYER_SUPPORT_FUNC(ClPadValidate(input, output, descriptor, reasonIfUnsupported));
+}
+
+bool IsPermuteSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const PermuteDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(output);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClPermuteWorkloadValidate, reasonIfUnsupported, descriptor);
+}
+
+bool IsPooling2dSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClPooling2dWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
+bool IsResizeBilinearSupportedCl(const TensorInfo& input,
+ std::string* reasonIfUnsupported)
+{
+ return IsSupportedForDataTypeCl(reasonIfUnsupported,
+ input.GetDataType(),
+ &TrueFunc<>,
+ &FalseFuncU8<>);
+}
+
+bool IsSoftmaxSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const SoftmaxDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(descriptor);
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClSoftmaxWorkloadValidate, reasonIfUnsupported, input, output);
+}
+
+bool IsSplitterSupportedCl(const TensorInfo& input,
+ const ViewsDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(descriptor);
+ return IsSupportedForDataTypeCl(reasonIfUnsupported,
+ input.GetDataType(),
+ &TrueFunc<>,
+ &TrueFunc<>);
+}
+
+bool IsFakeQuantizationSupportedCl(const TensorInfo& input,
+ const FakeQuantizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ ignore_unused(descriptor);
+ return false;
+}
+
+bool IsReshapeSupportedCl(const TensorInfo& input,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(input);
+ return true;
+}
+
+bool IsFloorSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ ignore_unused(output);
+ return IsClBackendSupported(reasonIfUnsupported) &&
+ IsSupportedForDataTypeGeneric(reasonIfUnsupported,
+ input.GetDataType(),
+ &FalseFuncF16<>,
+ &TrueFunc<>,
+ &FalseFuncU8<>);
+}
+
+bool IsLstmSupportedCl(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClLstmFloatWorkloadValidate, reasonIfUnsupported,
+ input, outputStateIn, cellStateIn, scratchBuffer, outputStateOut, cellStateOut,
+ output, descriptor, inputToForgetWeights, inputToCellWeights,
+ inputToOutputWeights, recurrentToForgetWeights,
+ recurrentToCellWeights, recurrentToOutputWeights,
+ forgetGateBias, cellBias, outputGateBias,
+ inputToInputWeights, recurrentToInputWeights,
+ cellToInputWeights, inputGateBias, projectionWeights,
+ projectionBias, cellToForgetWeights, cellToOutputWeights);
+}
+
+bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp16ToFp32WorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ reasonIfUnsupported);
+}
+
+bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp32ToFp16WorkloadValidate,
+ reasonIfUnsupported,
+ input,
+ output,
+ reasonIfUnsupported);
+}
+
+bool IsMeanSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const MeanDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ return false;
+}
+
+}
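
To make the direct convolution rules above concrete, a sketch with hypothetical shapes (indices [2] and [3] of the weight TensorInfo are the filter height and width inspected by IsMatchingSize2d): a 3x3 filter with stride 2 and symmetric padding keeps the direct path, while any asymmetric padding disables it.

    #include "ClLayerSupport.hpp"
    #include <armnn/Descriptors.hpp>
    #include <armnn/Tensor.hpp>

    bool DirectConv3x3Stride2Eligible()
    {
        using namespace armnn;

        // 16 output channels, 3 input channels, 3x3 filter (height and width at [2] and [3]).
        TensorInfo weights(TensorShape({16, 3, 3, 3}), DataType::Float32);

        Convolution2dDescriptor desc;
        desc.m_StrideX = 2;
        desc.m_StrideY = 2;
        desc.m_PadLeft = desc.m_PadRight  = 1; // symmetric padding
        desc.m_PadTop  = desc.m_PadBottom = 1;

        // 3x3 with stride 1 or 2 is accepted, so this returns true; making the
        // padding asymmetric (e.g. m_PadLeft = 0) would force the fallback path.
        return IsClDirectConvolution2dSupported(weights, desc);
    }
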
diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp
new file mode 100644
index 0000000000..700d71801d
--- /dev/null
+++ b/src/backends/cl/ClLayerSupport.hpp
@@ -0,0 +1,164 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/DescriptorsFwd.hpp>
+#include <armnn/Types.hpp>
+#include <armnn/Tensor.hpp>
+#include <armnn/ArmNN.hpp>
+
+#include <boost/optional.hpp>
+
+namespace armnn
+{
+bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc);
+bool IsClDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported,
+ const DepthwiseConvolution2dDescriptor& parameters,
+ const TensorInfo& weights);
+
+bool IsActivationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsAdditionSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsBatchNormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConstantSupportedCl(const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvolution2dSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsDivisionSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsSubtractionSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsFullyConnectedSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsInputSupportedCl(const TensorInfo& input,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsL2NormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const L2NormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsLstmSupportedCl(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias, const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr);
+
+bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs,
+ const OriginsDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsMultiplicationSupportedCl(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsNormalizationSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsOutputSupportedCl(const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsPadSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const PadDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsPermuteSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const PermuteDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsPooling2dSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsResizeBilinearSupportedCl(const TensorInfo& input,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsSoftmaxSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const SoftmaxDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsSplitterSupportedCl(const TensorInfo& input,
+ const ViewsDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsFakeQuantizationSupportedCl(const TensorInfo& input,
+ const FakeQuantizationDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsReshapeSupportedCl(const TensorInfo& input,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsFloorSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsMeanSupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ const MeanDescriptor& descriptor,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported = nullptr);
+
+}
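
The Is*SupportedCl queries above share one calling pattern, sketched here under the assumption that ARMCOMPUTECL_ENABLED is defined: pass a std::string to receive either arm_compute's error description or a note that the library was built without CL support.

    #include "ClLayerSupport.hpp"
    #include <armnn/Descriptors.hpp>
    #include <armnn/Tensor.hpp>

    #include <iostream>
    #include <string>

    void CheckSoftmaxOnGpuAcc(const armnn::TensorInfo& input, const armnn::TensorInfo& output)
    {
        armnn::SoftmaxDescriptor descriptor; // m_Beta defaults to 1.0f
        std::string reason;

        if (!armnn::IsSoftmaxSupportedCl(input, output, descriptor, &reason))
        {
            // With CL support built in, 'reason' carries arm_compute's error_description();
            // otherwise it explains that the library was built without CL support.
            std::cerr << "Softmax not supported on GpuAcc: " << reason << std::endl;
        }
    }
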
diff --git a/src/backends/cl/ClTensorHandle.hpp b/src/backends/cl/ClTensorHandle.hpp
new file mode 100644
index 0000000000..556e4479b6
--- /dev/null
+++ b/src/backends/cl/ClTensorHandle.hpp
@@ -0,0 +1,141 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backends/OutputHandler.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/CLSubTensor.h>
+#include <arm_compute/runtime/CL/CLMemoryGroup.h>
+#include <arm_compute/runtime/IMemoryGroup.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/Coordinates.h>
+
+#include <boost/polymorphic_pointer_cast.hpp>
+
+namespace armnn
+{
+
+
+class IClTensorHandle : public ITensorHandle
+{
+public:
+ virtual arm_compute::ICLTensor& GetTensor() = 0;
+ virtual arm_compute::ICLTensor const& GetTensor() const = 0;
+ virtual arm_compute::DataType GetDataType() const = 0;
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) = 0;
+};
+
+class ClTensorHandle : public IClTensorHandle
+{
+public:
+ ClTensorHandle(const TensorInfo& tensorInfo)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo);
+ }
+
+ ClTensorHandle(const TensorInfo& tensorInfo, DataLayout dataLayout)
+ {
+ armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout);
+ }
+
+ arm_compute::CLTensor& GetTensor() override { return m_Tensor; }
+ arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; }
+ virtual void Allocate() override {armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);}
+
+ virtual void Manage() override
+ {
+ assert(m_MemoryGroup != nullptr);
+ m_MemoryGroup->manage(&m_Tensor);
+ }
+
+ virtual const void* Map(bool blocking = true) const override
+ {
+ const_cast<arm_compute::CLTensor*>(&m_Tensor)->map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override { const_cast<arm_compute::CLTensor*>(&m_Tensor)->unmap(); }
+
+ virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; }
+
+ virtual ITensorHandle* GetParent() const override { return nullptr; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override
+ {
+ m_MemoryGroup = boost::polymorphic_pointer_downcast<arm_compute::CLMemoryGroup>(memoryGroup);
+ }
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+private:
+ arm_compute::CLTensor m_Tensor;
+ std::shared_ptr<arm_compute::CLMemoryGroup> m_MemoryGroup;
+};
+
+class ClSubTensorHandle : public IClTensorHandle
+{
+public:
+ ClSubTensorHandle(IClTensorHandle* parent,
+ const arm_compute::TensorShape& shape,
+ const arm_compute::Coordinates& coords)
+ : m_Tensor(&parent->GetTensor(), shape, coords)
+ {
+ parentHandle = parent;
+ }
+
+ arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; }
+ arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; }
+
+ virtual void Allocate() override {}
+ virtual void Manage() override {}
+
+ virtual const void* Map(bool blocking = true) const override
+ {
+ const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->map(blocking);
+ return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes());
+ }
+ virtual void Unmap() const override { const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->unmap(); }
+
+ virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; }
+
+ virtual ITensorHandle* GetParent() const override { return parentHandle; }
+
+ virtual arm_compute::DataType GetDataType() const override
+ {
+ return m_Tensor.info()->data_type();
+ }
+
+ virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {}
+
+ TensorShape GetStrides() const override
+ {
+ return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes());
+ }
+
+ TensorShape GetShape() const override
+ {
+ return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape());
+ }
+
+private:
+ mutable arm_compute::CLSubTensor m_Tensor;
+ ITensorHandle* parentHandle = nullptr;
+
+};
+
+}
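
A short sketch of the map/unmap contract implemented above, assuming a CL context has already been initialised (for example via ClContextControl) and the tensor has been allocated and written: Map(true) blocks until the device data is visible to the host and returns a pointer already offset to the first element, and every Map() must be paired with an Unmap().

    #include <backends/cl/ClTensorHandle.hpp>

    #include <cstring>
    #include <vector>

    std::vector<float> ReadBack(const armnn::ClTensorHandle& handle, unsigned int numElements)
    {
        std::vector<float> result(numElements);

        // Blocking map: returns buffer() + offset_first_element_in_bytes().
        const void* mapped = handle.Map(/*blocking=*/true);
        std::memcpy(result.data(), mapped, numElements * sizeof(float));
        handle.Unmap(); // always pair Map() with Unmap()

        return result;
    }
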
diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp
new file mode 100644
index 0000000000..46a96559bf
--- /dev/null
+++ b/src/backends/cl/ClWorkloadFactory.cpp
@@ -0,0 +1,506 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "ClWorkloadFactory.hpp"
+
+#include <armnn/Exceptions.hpp>
+#include <armnn/Utils.hpp>
+
+#include <string>
+#include <backends/CpuTensorHandle.hpp>
+#include <Layer.hpp>
+
+#ifdef ARMCOMPUTECL_ENABLED
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <backends/cl/workloads/ClWorkloads.hpp>
+
+#include <backends/MemCopyWorkload.hpp>
+#include <backends/cl/ClTensorHandle.hpp>
+
+#include <memory/IPoolManager.hpp>
+#endif
+
+#include <backends/MakeWorkloadHelper.hpp>
+
+#include <boost/polymorphic_cast.hpp>
+#include <boost/format.hpp>
+#include <boost/log/trivial.hpp>
+
+namespace armnn
+{
+
+bool ClWorkloadFactory::IsLayerSupported(const Layer& layer,
+ boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported)
+{
+ return IWorkloadFactory::IsLayerSupported(Compute::GpuAcc, layer, dataType, outReasonIfUnsupported);
+}
+
+#ifdef ARMCOMPUTECL_ENABLED
+
+ClWorkloadFactory::ClWorkloadFactory()
+: m_MemoryManager(std::make_unique<arm_compute::CLBufferAllocator>())
+{
+}
+
+std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
+{
+ std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo);
+ tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup());
+
+ return tensorHandle;
+}
+
+std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const
+{
+ std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo, dataLayout);
+ tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup());
+
+ return tensorHandle;
+}
+
+std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateSubTensorHandle(ITensorHandle& parent,
+ TensorShape const& subTensorShape,
+ unsigned int const* subTensorOrigin) const
+{
+ BOOST_ASSERT(parent.GetType() == ITensorHandle::CL);
+
+ arm_compute::Coordinates coords;
+ arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
+
+ coords.set_num_dimensions(subTensorShape.GetNumDimensions());
+ for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++)
+ {
+ // Arm compute indexes tensor coords in reverse order.
+ unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1;
+ coords.set(i, boost::numeric_cast<int>(subTensorOrigin[revertedIndex]));
+ }
+
+ return std::make_unique<ClSubTensorHandle>(
+ boost::polymorphic_downcast<IClTensorHandle*>(&parent), shape, coords);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClActivationFloatWorkload, ClActivationUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClSoftmaxFloatWorkload, ClSoftmaxUint8Workload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClSplitterFloatWorkload, ClSplitterUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMerger(const MergerQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClMergerFloatWorkload, ClMergerUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateFullyConnected(
+ const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClFullyConnectedWorkload, ClFullyConnectedWorkload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClPermuteFloatWorkload, ClPermuteUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClPooling2dFloatWorkload, ClPooling2dUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateConvolution2d(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClConvolution2dFloatWorkload, ClConvolution2dUint8Workload>(descriptor, info,
+ m_MemoryManager.GetIntraLayerManager());
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDepthwiseConvolution2d(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClDepthwiseConvolutionFloatWorkload, ClDepthwiseConvolutionUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateNormalization(const NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClNormalizationFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClAdditionWorkload<armnn::DataType::Float16, armnn::DataType::Float32>,
+ ClAdditionWorkload<armnn::DataType::QuantisedAsymm8>>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMultiplication(
+ const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClMultiplicationFloatWorkload, ClMultiplicationFloatWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateDivision(
+ const DivisionQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClDivisionFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClSubtractionWorkload<armnn::DataType::Float16, armnn::DataType::Float32>,
+ ClSubtractionWorkload<armnn::DataType::QuantisedAsymm8>>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateBatchNormalization(
+ const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClBatchNormalizationFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ if (descriptor.m_Inputs.empty() || !descriptor.m_Inputs[0])
+ {
+ throw InvalidArgumentException("ClWorkloadFactory: Invalid null input for MemCopy workload");
+ }
+
+ return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info);
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateResizeBilinear(
+ const ResizeBilinearQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClResizeBilinearFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFakeQuantization(
+ const FakeQuantizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClL2NormalizationFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClConstantFloatWorkload, ClConstantUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClReshapeFloatWorkload, ClReshapeUint8Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClFloorFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClLstmFloatWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<ClConvertFp16ToFp32Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return std::make_unique<ClConvertFp32ToFp16Workload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMean(const MeanQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePad(const PadQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return MakeWorkload<ClPadWorkload<armnn::DataType::Float16, armnn::DataType::Float32>,
+ ClPadWorkload<armnn::DataType::QuantisedAsymm8>>(descriptor, info);
+}
+
+void ClWorkloadFactory::Finalize()
+{
+ m_MemoryManager.Finalize();
+}
+
+void ClWorkloadFactory::Release()
+{
+ m_MemoryManager.Release();
+}
+
+void ClWorkloadFactory::Acquire()
+{
+ m_MemoryManager.Acquire();
+}
+
+#else // #if ARMCOMPUTECL_ENABLED
+
+ClWorkloadFactory::ClWorkloadFactory()
+{
+}
+
+std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateSubTensorHandle(ITensorHandle& parent,
+ TensorShape const& subTensorShape,
+ unsigned int const* subTensorOrigin) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMerger(const MergerQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFullyConnected(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvolution2d(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDepthwiseConvolution2d(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateNormalization(const NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMultiplication(const MultiplicationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateBatchNormalization(
+ const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFakeQuantization(const FakeQuantizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateReshape(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp16ToFp32(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp32ToFp16(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDivision(const DivisionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateMean(const MeanQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreatePad(const PadQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const
+{
+ return nullptr;
+}
+
+void ClWorkloadFactory::Finalize()
+{
+}
+
+void ClWorkloadFactory::Release()
+{
+}
+
+void ClWorkloadFactory::Acquire()
+{
+}
+
+#endif // #if ARMCOMPUTECL_ENABLED
+
+} // namespace armnn
diff --git a/src/backends/cl/ClWorkloadFactory.hpp b/src/backends/cl/ClWorkloadFactory.hpp
new file mode 100644
index 0000000000..59ae3b343a
--- /dev/null
+++ b/src/backends/cl/ClWorkloadFactory.hpp
@@ -0,0 +1,139 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backends/OutputHandler.hpp>
+
+#include <armnn/IRuntime.hpp>
+#include <boost/optional.hpp>
+
+#include "memory/BaseMemoryManager.hpp"
+
+namespace armnn
+{
+
+// ARM Compute OpenCL workload factory.
+class ClWorkloadFactory : public IWorkloadFactory
+{
+public:
+ ClWorkloadFactory();
+
+ virtual Compute GetCompute() const override { return Compute::GpuAcc; }
+
+ static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType,
+ std::string& outReasonIfUnsupported);
+
+ virtual bool SupportsSubTensors() const override { return true; }
+
+ virtual std::unique_ptr<ITensorHandle> CreateSubTensorHandle(ITensorHandle& parent,
+ TensorShape const& subTensorShape,
+ unsigned int const* subTensorOrigin) const override;
+
+ virtual std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo) const override;
+
+ virtual std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo,
+ DataLayout dataLayout) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateInput(const InputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateOutput(const OutputQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateActivation(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateSoftmax(const SoftmaxQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateSplitter(const SplitterQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMerger(const MergerQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateFullyConnected(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreatePermute(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreatePooling2d(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvolution2d(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateDepthwiseConvolution2d(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateNormalization(const NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateAddition(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMultiplication(const MultiplicationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateBatchNormalization(const BatchNormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMemCopy(const MemCopyQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateFakeQuantization(const FakeQuantizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConstant(const ConstantQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateReshape(const ReshapeQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateDivision(const DivisionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreateMean(const MeanQueueDescriptor& descriptor,
+                                                   const WorkloadInfo& info) const override;
+
+ virtual std::unique_ptr<IWorkload> CreatePad(const PadQueueDescriptor& descriptor,
+ const WorkloadInfo& info) const override;
+
+ virtual void Finalize() override;
+
+ virtual void Release() override;
+
+ virtual void Acquire() override;
+
+private:
+
+#ifdef ARMCOMPUTECL_ENABLED
+ mutable ClMemoryManager m_MemoryManager;
+#endif
+};
+
+} // namespace armnn
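A minimal usage sketch of the factory interface declared above, assuming an ArmNN build with ARMCOMPUTECL_ENABLED and a valid OpenCL context; the sketch is illustrative only (the function name is hypothetical) and uses just the methods shown in this header.

// Illustrative sketch, not part of this patch.
#include <backends/cl/ClWorkloadFactory.hpp>
#include <armnn/Tensor.hpp>
#include <memory>

void SketchCreateClTensorHandle()
{
    using namespace armnn;

    ClWorkloadFactory factory;

    // Describe a 1x3x224x224 Float32 tensor and ask the factory for a CL-backed handle.
    TensorInfo info(TensorShape({1, 3, 224, 224}), DataType::Float32);
    auto handle = factory.CreateTensorHandle(info);

    // The factory identifies itself as the GPU-accelerated backend.
    const Compute compute = factory.GetCompute(); // Compute::GpuAcc
    (void)handle;
    (void)compute;
}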
diff --git a/src/backends/cl/backend.cmake b/src/backends/cl/backend.cmake
new file mode 100644
index 0000000000..1af88e3c9b
--- /dev/null
+++ b/src/backends/cl/backend.cmake
@@ -0,0 +1,13 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+if(ARMCOMPUTECL)
+ add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/cl)
+ list(APPEND armnnLibraries armnnClBackend armnnClBackendWorkloads)
+else()
+ message("CL backend is disabled")
+ add_subdirectory(${PROJECT_SOURCE_DIR}/src/backends/cl)
+ list(APPEND armnnLibraries armnnClBackend)
+endif()
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
new file mode 100644
index 0000000000..2418a24249
--- /dev/null
+++ b/src/backends/cl/backend.mk
@@ -0,0 +1,51 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+# BACKEND_SOURCES contains the list of files to be included
+# in the Android build and it is picked up by the Android.mk
+# file in the root of ArmNN
+
+BACKEND_SOURCES := \
+ ClContextControl.cpp \
+ ClLayerSupport.cpp \
+ ClWorkloadFactory.cpp \
+ workloads/ClActivationFloatWorkload.cpp \
+ workloads/ClActivationUint8Workload.cpp \
+ workloads/ClAdditionWorkload.cpp \
+ workloads/ClBaseConstantWorkload.cpp \
+ workloads/ClBatchNormalizationFloatWorkload.cpp \
+ workloads/ClConstantFloatWorkload.cpp \
+ workloads/ClConstantUint8Workload.cpp \
+ workloads/ClConvertFp16ToFp32Workload.cpp \
+ workloads/ClConvertFp32ToFp16Workload.cpp \
+ workloads/ClConvolution2dBaseWorkload.cpp \
+ workloads/ClConvolution2dFloatWorkload.cpp \
+ workloads/ClConvolution2dUint8Workload.cpp \
+ workloads/ClDepthwiseConvolutionBaseWorkload.cpp \
+ workloads/ClDepthwiseConvolutionFloatWorkload.cpp \
+ workloads/ClDepthwiseConvolutionUint8Workload.cpp \
+ workloads/ClDivisionFloatWorkload.cpp \
+ workloads/ClFloorFloatWorkload.cpp \
+ workloads/ClFullyConnectedWorkload.cpp \
+ workloads/ClL2NormalizationFloatWorkload.cpp \
+ workloads/ClLstmFloatWorkload.cpp \
+ workloads/ClMergerFloatWorkload.cpp \
+ workloads/ClMergerUint8Workload.cpp \
+ workloads/ClMultiplicationFloatWorkload.cpp \
+ workloads/ClNormalizationFloatWorkload.cpp \
+ workloads/ClPadWorkload.cpp \
+ workloads/ClPermuteWorkload.cpp \
+ workloads/ClPooling2dBaseWorkload.cpp \
+ workloads/ClPooling2dFloatWorkload.cpp \
+ workloads/ClPooling2dUint8Workload.cpp \
+ workloads/ClReshapeFloatWorkload.cpp \
+ workloads/ClReshapeUint8Workload.cpp \
+ workloads/ClResizeBilinearFloatWorkload.cpp \
+ workloads/ClSoftmaxBaseWorkload.cpp \
+ workloads/ClSoftmaxFloatWorkload.cpp \
+ workloads/ClSoftmaxUint8Workload.cpp \
+ workloads/ClSplitterFloatWorkload.cpp \
+ workloads/ClSplitterUint8Workload.cpp \
+ workloads/ClSubtractionWorkload.cpp
diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/src/backends/cl/test/CMakeLists.txt
diff --git a/src/backends/cl/workloads/CMakeLists.txt b/src/backends/cl/workloads/CMakeLists.txt
new file mode 100644
index 0000000000..066c37f083
--- /dev/null
+++ b/src/backends/cl/workloads/CMakeLists.txt
@@ -0,0 +1,92 @@
+#
+# Copyright © 2017 Arm Ltd. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+
+list(APPEND armnnClBackendWorkloads_sources
+ ClActivationFloatWorkload.cpp
+ ClActivationFloatWorkload.hpp
+ ClActivationUint8Workload.cpp
+ ClActivationUint8Workload.hpp
+ ClAdditionWorkload.cpp
+ ClAdditionWorkload.hpp
+ ClBaseConstantWorkload.cpp
+ ClBaseConstantWorkload.hpp
+ ClBaseMergerWorkload.hpp
+ ClBaseSplitterWorkload.hpp
+ ClBatchNormalizationFloatWorkload.cpp
+ ClBatchNormalizationFloatWorkload.hpp
+ ClConstantFloatWorkload.cpp
+ ClConstantFloatWorkload.hpp
+ ClConstantUint8Workload.cpp
+ ClConstantUint8Workload.hpp
+ ClConvertFp16ToFp32Workload.cpp
+ ClConvertFp16ToFp32Workload.hpp
+ ClConvertFp32ToFp16Workload.cpp
+ ClConvertFp32ToFp16Workload.hpp
+ ClConvolution2dBaseWorkload.cpp
+ ClConvolution2dBaseWorkload.hpp
+ ClConvolution2dFloatWorkload.cpp
+ ClConvolution2dFloatWorkload.hpp
+ ClConvolution2dUint8Workload.cpp
+ ClConvolution2dUint8Workload.hpp
+ ClDepthwiseConvolutionBaseWorkload.cpp
+ ClDepthwiseConvolutionBaseWorkload.hpp
+ ClDepthwiseConvolutionFloatWorkload.cpp
+ ClDepthwiseConvolutionFloatWorkload.hpp
+ ClDepthwiseConvolutionUint8Workload.cpp
+ ClDepthwiseConvolutionUint8Workload.hpp
+ ClDivisionFloatWorkload.cpp
+ ClDivisionFloatWorkload.hpp
+ ClFloorFloatWorkload.cpp
+ ClFloorFloatWorkload.hpp
+ ClFullyConnectedWorkload.cpp
+ ClFullyConnectedWorkload.hpp
+ ClL2NormalizationFloatWorkload.cpp
+ ClL2NormalizationFloatWorkload.hpp
+ ClLstmFloatWorkload.cpp
+ ClLstmFloatWorkload.hpp
+ ClMergerFloatWorkload.cpp
+ ClMergerFloatWorkload.hpp
+ ClMergerUint8Workload.cpp
+ ClMergerUint8Workload.hpp
+ ClMultiplicationFloatWorkload.cpp
+ ClMultiplicationFloatWorkload.hpp
+ ClNormalizationFloatWorkload.cpp
+ ClNormalizationFloatWorkload.hpp
+ ClPadWorkload.cpp
+ ClPadWorkload.hpp
+ ClPermuteWorkload.cpp
+ ClPermuteWorkload.hpp
+ ClPooling2dBaseWorkload.cpp
+ ClPooling2dBaseWorkload.hpp
+ ClPooling2dFloatWorkload.cpp
+ ClPooling2dFloatWorkload.hpp
+ ClPooling2dUint8Workload.cpp
+ ClPooling2dUint8Workload.hpp
+ ClReshapeFloatWorkload.cpp
+ ClReshapeFloatWorkload.hpp
+ ClReshapeUint8Workload.cpp
+ ClReshapeUint8Workload.hpp
+ ClResizeBilinearFloatWorkload.cpp
+ ClResizeBilinearFloatWorkload.hpp
+ ClSoftmaxBaseWorkload.cpp
+ ClSoftmaxBaseWorkload.hpp
+ ClSoftmaxFloatWorkload.cpp
+ ClSoftmaxFloatWorkload.hpp
+ ClSoftmaxUint8Workload.cpp
+ ClSoftmaxUint8Workload.hpp
+ ClSplitterFloatWorkload.cpp
+ ClSplitterFloatWorkload.hpp
+ ClSplitterUint8Workload.cpp
+ ClSplitterUint8Workload.hpp
+ ClSubtractionWorkload.cpp
+ ClSubtractionWorkload.hpp
+ ClWorkloads.hpp
+ ClWorkloadUtils.hpp
+)
+
+add_library(armnnClBackendWorkloads STATIC ${armnnClBackendWorkloads_sources})
+target_include_directories(armnnClBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src)
+target_include_directories(armnnClBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
+target_include_directories(armnnClBackendWorkloads PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
diff --git a/src/backends/cl/workloads/ClActivationFloatWorkload.cpp b/src/backends/cl/workloads/ClActivationFloatWorkload.cpp
new file mode 100644
index 0000000000..cbaac9d226
--- /dev/null
+++ b/src/backends/cl/workloads/ClActivationFloatWorkload.cpp
@@ -0,0 +1,56 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClActivationFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ const arm_compute::ActivationLayerInfo activationLayerInfo =
+ ConvertActivationDescriptorToAclActivationLayerInfo(descriptor);
+
+ if (input.GetDataType() == DataType::QuantisedAsymm8 &&
+ activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC)
+ {
+ return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR,
+ "CL: Logistic Activations unsupported with QAsymm8 data type."};
+ }
+
+ return arm_compute::CLActivationLayer::validate(&aclInput,
+ &aclOutput,
+ activationLayerInfo);
+}
+
+ClActivationFloatWorkload::ClActivationFloatWorkload(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<ActivationQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClActivationFloatWorkload", 1, 1);
+
+ const arm_compute::ActivationLayerInfo activationLayerInfo =
+ ConvertActivationDescriptorToAclActivationLayerInfo(m_Data.m_Parameters);
+
+ arm_compute::ICLTensor& input = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ m_ActivationLayer.configure(&input, &output, activationLayerInfo);
+}
+
+void ClActivationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationFloatWorkload_Execute");
+ m_ActivationLayer.run();
+}
+
+} //namespace armnn
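The validate function above lets support be checked before a workload is constructed. A hedged sketch of that pattern, assuming the include paths set up by this patch and the usual armnn::TensorInfo constructors (function name hypothetical):

// Illustrative sketch: ask CL whether a bounded ReLU on a Float32 tensor is supported.
#include <backends/cl/workloads/ClActivationFloatWorkload.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/Tensor.hpp>

bool SketchCanRunBoundedReLuOnCl()
{
    using namespace armnn;

    const TensorInfo input(TensorShape({1, 16, 8, 8}), DataType::Float32);
    const TensorInfo output(TensorShape({1, 16, 8, 8}), DataType::Float32);

    ActivationDescriptor descriptor;
    descriptor.m_Function = ActivationFunction::BoundedReLu;
    descriptor.m_A = 6.0f; // upper bound
    descriptor.m_B = 0.0f; // lower bound

    const arm_compute::Status status = ClActivationWorkloadValidate(input, output, descriptor);
    return status.error_code() == arm_compute::ErrorCode::OK;
}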
diff --git a/src/backends/cl/workloads/ClActivationFloatWorkload.hpp b/src/backends/cl/workloads/ClActivationFloatWorkload.hpp
new file mode 100644
index 0000000000..cb560a791b
--- /dev/null
+++ b/src/backends/cl/workloads/ClActivationFloatWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const ActivationDescriptor& descriptor);
+
+// Activation layer execution.
+class ClActivationFloatWorkload : public FloatWorkload<ActivationQueueDescriptor>
+{
+public:
+ ClActivationFloatWorkload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLActivationLayer m_ActivationLayer;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClActivationUint8Workload.cpp b/src/backends/cl/workloads/ClActivationUint8Workload.cpp
new file mode 100644
index 0000000000..ad6b73074b
--- /dev/null
+++ b/src/backends/cl/workloads/ClActivationUint8Workload.cpp
@@ -0,0 +1,44 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClActivationUint8Workload.hpp"
+#include <backends/cl/ClLayerSupport.hpp>
+
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : Uint8Workload<ActivationQueueDescriptor>(descriptor, info)
+{
+ auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function);
+ arm_compute::ActivationLayerInfo layerInfo(activation,
+ m_Data.m_Parameters.m_A,
+ m_Data.m_Parameters.m_B);
+
+ m_Data.ValidateInputsOutputs("ClActivationUint8Workload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_ActivationLayer.configure(&input, &output, layerInfo);
+}
+
+void ClActivationUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationUint8Workload_Execute");
+
+ m_ActivationLayer.run();
+}
+
+} //namespace armnn
+
+
diff --git a/src/backends/cl/workloads/ClActivationUint8Workload.hpp b/src/backends/cl/workloads/ClActivationUint8Workload.hpp
new file mode 100644
index 0000000000..d0b7d3a78f
--- /dev/null
+++ b/src/backends/cl/workloads/ClActivationUint8Workload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+// Activation layer execution.
+class ClActivationUint8Workload : public Uint8Workload<ActivationQueueDescriptor>
+{
+public:
+ ClActivationUint8Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLActivationLayer m_ActivationLayer;
+};
+
+} //namespace armnn
+
+
+
diff --git a/src/backends/cl/workloads/ClAdditionWorkload.cpp b/src/backends/cl/workloads/ClAdditionWorkload.cpp
new file mode 100644
index 0000000000..aa032e872c
--- /dev/null
+++ b/src/backends/cl/workloads/ClAdditionWorkload.cpp
@@ -0,0 +1,66 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClAdditionWorkload.hpp"
+
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+template <armnn::DataType... T>
+ClAdditionWorkload<T...>::ClAdditionWorkload(const AdditionQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : TypedWorkload<AdditionQueueDescriptor, T...>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("ClAdditionWorkload", 2, 1);
+
+ arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+ m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy);
+}
+
+template <armnn::DataType... T>
+void ClAdditionWorkload<T...>::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionWorkload_Execute");
+ m_Layer.run();
+}
+
+bool ClAdditionValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+ const arm_compute::Status aclStatus = arm_compute::CLArithmeticAddition::validate(&aclInput0Info,
+ &aclInput1Info,
+ &aclOutputInfo,
+ g_AclConvertPolicy);
+
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = aclStatus.error_description();
+ }
+
+ return supported;
+}
+
+} //namespace armnn
+
+template class armnn::ClAdditionWorkload<armnn::DataType::Float16, armnn::DataType::Float32>;
+template class armnn::ClAdditionWorkload<armnn::DataType::QuantisedAsymm8>;
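ClAdditionValidate returns a plain bool and, on failure, writes the Compute Library's reason string through the optional pointer. A short usage sketch under the same include-path assumptions as above (function name hypothetical):

// Illustrative sketch: query CL support for an elementwise addition.
#include <backends/cl/workloads/ClAdditionWorkload.hpp>
#include <armnn/Tensor.hpp>
#include <iostream>
#include <string>

void SketchCheckClAdditionSupport()
{
    using namespace armnn;

    const TensorInfo a(TensorShape({2, 4}), DataType::Float32);
    const TensorInfo b(TensorShape({2, 4}), DataType::Float32);
    const TensorInfo out(TensorShape({2, 4}), DataType::Float32);

    std::string reason;
    if (!ClAdditionValidate(a, b, out, &reason))
    {
        std::cout << "CL addition unsupported: " << reason << std::endl;
    }
}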
diff --git a/src/backends/cl/workloads/ClAdditionWorkload.hpp b/src/backends/cl/workloads/ClAdditionWorkload.hpp
new file mode 100644
index 0000000000..3e4ee26793
--- /dev/null
+++ b/src/backends/cl/workloads/ClAdditionWorkload.hpp
@@ -0,0 +1,31 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+template <armnn::DataType... dataTypes>
+class ClAdditionWorkload : public TypedWorkload<AdditionQueueDescriptor, dataTypes...>
+{
+public:
+ ClAdditionWorkload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLArithmeticAddition m_Layer;
+};
+
+bool ClAdditionValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported);
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClBaseConstantWorkload.cpp b/src/backends/cl/workloads/ClBaseConstantWorkload.cpp
new file mode 100644
index 0000000000..2557020b59
--- /dev/null
+++ b/src/backends/cl/workloads/ClBaseConstantWorkload.cpp
@@ -0,0 +1,64 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClBaseConstantWorkload.hpp"
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <Half.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+template class ClBaseConstantWorkload<DataType::Float16, DataType::Float32>;
+template class ClBaseConstantWorkload<DataType::QuantisedAsymm8>;
+
+template<armnn::DataType... dataTypes>
+void ClBaseConstantWorkload<dataTypes...>::Execute() const
+{
+ // The intermediate tensor held by the corresponding layer output handler can be initialised with the given data
+ // on the first inference, then reused for subsequent inferences.
+ // The initialisation cannot happen at workload construction time since the ACL kernel for the next layer may not
+ // have been configured at the time.
+ if (!m_RanOnce)
+ {
+ const ConstantQueueDescriptor& data = this->m_Data;
+
+ BOOST_ASSERT(data.m_LayerOutput != nullptr);
+ arm_compute::CLTensor& output = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetTensor();
+ arm_compute::DataType computeDataType = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetDataType();
+
+ switch (computeDataType)
+ {
+ case arm_compute::DataType::F16:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<Half>());
+ break;
+ }
+ case arm_compute::DataType::F32:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<float>());
+ break;
+ }
+ case arm_compute::DataType::QASYMM8:
+ {
+ CopyArmComputeClTensorData(output, data.m_LayerOutput->GetConstTensor<uint8_t>());
+ break;
+ }
+ default:
+ {
+ BOOST_ASSERT_MSG(false, "Unknown data type");
+ break;
+ }
+ }
+
+ m_RanOnce = true;
+ }
+}
+
+
+} //namespace armnn
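The run-once initialisation above is a general pattern: the upload is deferred to the first Execute() so that downstream ACL kernels are already configured, and the flag is mutable because Execute() is const. A library-independent sketch of the same idea (all types here are illustrative placeholders, not ArmNN or ACL API):

// Standalone sketch of the run-once initialisation used by ClBaseConstantWorkload.
#include <vector>

class ConstantSourceSketch
{
public:
    explicit ConstantSourceSketch(std::vector<float> data)
        : m_Data(std::move(data))
        , m_RanOnce(false)
    {
    }

    // Const interface, mirroring IWorkload::Execute(), hence the mutable flag.
    void Execute(std::vector<float>& outputBuffer) const
    {
        if (!m_RanOnce)
        {
            // First run: initialise the output buffer with the constant data.
            outputBuffer = m_Data;
            m_RanOnce = true;
        }
        // Later runs reuse the already-initialised buffer unchanged.
    }

private:
    std::vector<float> m_Data;
    mutable bool m_RanOnce;
};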
diff --git a/src/backends/cl/workloads/ClBaseConstantWorkload.hpp b/src/backends/cl/workloads/ClBaseConstantWorkload.hpp
new file mode 100644
index 0000000000..f7a23a9162
--- /dev/null
+++ b/src/backends/cl/workloads/ClBaseConstantWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+template <armnn::DataType... DataTypes>
+class ClBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataTypes...>
+{
+public:
+ ClBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : TypedWorkload<ConstantQueueDescriptor, DataTypes...>(descriptor, info)
+ , m_RanOnce(false)
+ {
+ }
+
+ void Execute() const override;
+
+private:
+ mutable bool m_RanOnce;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClBaseMergerWorkload.hpp b/src/backends/cl/workloads/ClBaseMergerWorkload.hpp
new file mode 100644
index 0000000000..f8ff6f9379
--- /dev/null
+++ b/src/backends/cl/workloads/ClBaseMergerWorkload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+// Base class template providing an implementation of the Merger layer common to all data types.
+template <armnn::DataType... DataTypes>
+class ClBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataTypes...>
+{
+public:
+ using TypedWorkload<MergerQueueDescriptor, DataTypes...>::TypedWorkload;
+
+ void Execute() const override
+ {
+ // With subtensors, merger is a no-op.
+ }
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClBaseSplitterWorkload.hpp b/src/backends/cl/workloads/ClBaseSplitterWorkload.hpp
new file mode 100644
index 0000000000..7fdcc84235
--- /dev/null
+++ b/src/backends/cl/workloads/ClBaseSplitterWorkload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+// Base class template providing an implementation of the Splitter layer common to all data types.
+template <armnn::DataType... DataTypes>
+class ClBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataTypes...>
+{
+public:
+ using TypedWorkload<SplitterQueueDescriptor, DataTypes...>::TypedWorkload;
+
+ void Execute() const override
+ {
+        // With subtensors, splitter is a no-op.
+ }
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
new file mode 100644
index 0000000000..5bff7a63c9
--- /dev/null
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp
@@ -0,0 +1,96 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClBatchNormalizationFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/ClLayerSupport.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor &desc)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean);
+ const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var);
+ const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta);
+ const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma);
+
+ return arm_compute::CLBatchNormalizationLayer::validate(&aclInputInfo,
+ &aclOutputInfo,
+ &aclMeanInfo,
+ &aclVarInfo,
+ &aclBetaInfo,
+ &aclGammaInfo,
+ desc.m_Eps);
+}
+
+ClBatchNormalizationFloatWorkload::ClBatchNormalizationFloatWorkload(
+ const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info)
+{
+ m_Mean = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo());
+
+ m_Variance = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo());
+
+ m_Gamma = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo());
+
+ m_Beta = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo());
+
+ m_Data.ValidateInputsOutputs("ClBatchNormalizationFloatWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input,
+ &output,
+ m_Mean.get(),
+ m_Variance.get(),
+ m_Beta.get(),
+ m_Gamma.get(),
+ m_Data.m_Parameters.m_Eps);
+
+ InitializeArmComputeClTensorData(*m_Mean, m_Data.m_Mean);
+ InitializeArmComputeClTensorData(*m_Variance, m_Data.m_Variance);
+ InitializeArmComputeClTensorData(*m_Beta, m_Data.m_Beta);
+ InitializeArmComputeClTensorData(*m_Gamma, m_Data.m_Gamma);
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_Layer.prepare();
+ FreeUnusedTensors();
+}
+
+void ClBatchNormalizationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClBatchNormalizationFloatWorkload_Execute");
+ m_Layer.run();
+}
+
+void ClBatchNormalizationFloatWorkload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_Mean);
+ FreeTensorIfUnused(m_Variance);
+ FreeTensorIfUnused(m_Gamma);
+ FreeTensorIfUnused(m_Beta);
+}
+
+} //namespace armnn
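ClBatchNormalizationValidate takes per-channel mean/variance/beta/gamma infos alongside the input and output. A hedged sketch of a support query, assuming the usual armnn descriptor and tensor constructors (function name hypothetical):

// Illustrative sketch: validate a CL batch normalization configuration.
#include <backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/Tensor.hpp>

bool SketchCanRunBatchNormOnCl()
{
    using namespace armnn;

    const TensorInfo input(TensorShape({1, 16, 8, 8}), DataType::Float32);
    const TensorInfo output(TensorShape({1, 16, 8, 8}), DataType::Float32);
    const TensorInfo perChannel(TensorShape({16}), DataType::Float32); // mean, var, beta, gamma

    BatchNormalizationDescriptor desc;
    desc.m_Eps = 1e-5f;

    const arm_compute::Status status = ClBatchNormalizationValidate(input, output,
                                                                    perChannel, perChannel,
                                                                    perChannel, perChannel,
                                                                    desc);
    return status.error_code() == arm_compute::ErrorCode::OK;
}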
diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
new file mode 100644
index 0000000000..804591c444
--- /dev/null
+++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp
@@ -0,0 +1,46 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& mean,
+ const TensorInfo& var,
+ const TensorInfo& beta,
+ const TensorInfo& gamma,
+ const BatchNormalizationDescriptor& desc);
+
+class ClBatchNormalizationFloatWorkload : public FloatWorkload<BatchNormalizationQueueDescriptor>
+{
+public:
+ ClBatchNormalizationFloatWorkload(const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ using FloatWorkload<BatchNormalizationQueueDescriptor>::FloatWorkload;
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLBatchNormalizationLayer m_Layer;
+
+ std::unique_ptr<arm_compute::CLTensor> m_Mean;
+ std::unique_ptr<arm_compute::CLTensor> m_Variance;
+ std::unique_ptr<arm_compute::CLTensor> m_Gamma;
+ std::unique_ptr<arm_compute::CLTensor> m_Beta;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
+
+
+
+
diff --git a/src/backends/cl/workloads/ClConstantFloatWorkload.cpp b/src/backends/cl/workloads/ClConstantFloatWorkload.cpp
new file mode 100644
index 0000000000..1565047c22
--- /dev/null
+++ b/src/backends/cl/workloads/ClConstantFloatWorkload.cpp
@@ -0,0 +1,18 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClConstantFloatWorkload.hpp"
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+void ClConstantFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantFloatWorkload_Execute");
+ ClBaseConstantWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConstantFloatWorkload.hpp b/src/backends/cl/workloads/ClConstantFloatWorkload.hpp
new file mode 100644
index 0000000000..0cbeaad9ea
--- /dev/null
+++ b/src/backends/cl/workloads/ClConstantFloatWorkload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseConstantWorkload.hpp"
+
+namespace armnn
+{
+class ClConstantFloatWorkload : public ClBaseConstantWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ using ClBaseConstantWorkload<DataType::Float16, DataType::Float32>::ClBaseConstantWorkload;
+ void Execute() const override;
+};
+
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConstantUint8Workload.cpp b/src/backends/cl/workloads/ClConstantUint8Workload.cpp
new file mode 100644
index 0000000000..a5ef0321cd
--- /dev/null
+++ b/src/backends/cl/workloads/ClConstantUint8Workload.cpp
@@ -0,0 +1,18 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClConstantUint8Workload.hpp"
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+void ClConstantUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantUint8Workload_Execute");
+ ClBaseConstantWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConstantUint8Workload.hpp b/src/backends/cl/workloads/ClConstantUint8Workload.hpp
new file mode 100644
index 0000000000..30556dc0d6
--- /dev/null
+++ b/src/backends/cl/workloads/ClConstantUint8Workload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseConstantWorkload.hpp"
+
+namespace armnn
+{
+
+class ClConstantUint8Workload : public ClBaseConstantWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ using ClBaseConstantWorkload<DataType::QuantisedAsymm8>::ClBaseConstantWorkload;
+ void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
new file mode 100644
index 0000000000..e7663b4ca4
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
@@ -0,0 +1,66 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClConvertFp16ToFp32Workload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+ClConvertFp16ToFp32Workload::ClConvertFp16ToFp32Workload(
+ const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info) :
+ Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("ClConvertFp16ToFp32Workload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input, &output, g_AclConvertPolicy, 0);
+}
+
+void ClConvertFp16ToFp32Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp16ToFp32Workload_Execute");
+ m_Layer.run();
+}
+
+arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ if (input.GetDataType() != DataType::Float16)
+ {
+ *reasonIfUnsupported = "Input should be Float16";
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+ }
+ if (output.GetDataType() != DataType::Float32)
+ {
+ *reasonIfUnsupported = "Output should be Float32";
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+ }
+
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+ const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate(
+ &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0);
+
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = aclStatus.error_description();
+ }
+
+ return aclStatus;
+}
+
+
+} //namespace armnn
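The validate function above first enforces the Float16-to-Float32 direction itself, then defers to CLDepthConvertLayer::validate. A short sketch of a support query (same include-path assumptions as above; note the reason pointer should be non-null, since the early-exit paths write through it unconditionally):

// Illustrative sketch: check whether CL can convert Float16 to Float32 for a given shape.
#include <backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp>
#include <armnn/Tensor.hpp>
#include <string>

bool SketchCanConvertFp16ToFp32OnCl()
{
    using namespace armnn;

    const TensorInfo input(TensorShape({1, 8, 16, 16}), DataType::Float16);
    const TensorInfo output(TensorShape({1, 8, 16, 16}), DataType::Float32);

    std::string reason; // always pass a valid pointer; see the early returns above
    const arm_compute::Status status = ClConvertFp16ToFp32WorkloadValidate(input, output, &reason);
    return status.error_code() == arm_compute::ErrorCode::OK;
}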
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
new file mode 100644
index 0000000000..b6447488f7
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+class ClConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>
+{
+public:
+
+ ClConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::CLDepthConvertLayer m_Layer;
+};
+
+arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported);
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
new file mode 100644
index 0000000000..2ae4adc424
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
@@ -0,0 +1,66 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClConvertFp32ToFp16Workload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+ClConvertFp32ToFp16Workload::ClConvertFp32ToFp16Workload(
+ const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info) :
+ Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("ClConvertFp32ToFp16Workload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input, &output, g_AclConvertPolicy, 0);
+}
+
+void ClConvertFp32ToFp16Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp32ToFp16Workload_Execute");
+ m_Layer.run();
+}
+
+arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ if (input.GetDataType() != DataType::Float32)
+ {
+ *reasonIfUnsupported = "Input should be Float32";
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+ }
+ if (output.GetDataType() != DataType::Float16)
+ {
+ *reasonIfUnsupported = "Output should be Float16";
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported);
+ }
+
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+ const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate(
+ &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0);
+
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = aclStatus.error_description();
+ }
+
+ return aclStatus;
+}
+
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
new file mode 100644
index 0000000000..95d19905d7
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+class ClConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>
+{
+public:
+
+ ClConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ mutable arm_compute::CLDepthConvertLayer m_Layer;
+};
+
+arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported);
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvolution2dBaseWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dBaseWorkload.cpp
new file mode 100644
index 0000000000..58699a8287
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvolution2dBaseWorkload.cpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClConvolution2dBaseWorkload.hpp"
+#include <backends/cl/ClLayerSupport.hpp>
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/runtime/CL/functions/CLConvolutionLayer.h>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+arm_compute::Status ClConvolution2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+
+ arm_compute::TensorInfo aclBiasesInfo;
+ arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
+
+ if (descriptor.m_BiasEnabled)
+ {
+ BOOST_ASSERT(biases.is_initialized());
+
+ aclBiasesInfo = BuildArmComputeTensorInfo(biases.get(), descriptor.m_DataLayout);
+ optionalAclBiasesInfo = &aclBiasesInfo;
+ }
+
+ arm_compute::PadStrideInfo layerInfo = BuildArmComputePadStrideInfo(descriptor);
+
+ return arm_compute::CLConvolutionLayer::validate(&aclInputInfo,
+ &aclWeightsInfo,
+ optionalAclBiasesInfo,
+ &aclOutputInfo,
+ layerInfo);
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvolution2dBaseWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dBaseWorkload.hpp
new file mode 100644
index 0000000000..a983dba79a
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvolution2dBaseWorkload.hpp
@@ -0,0 +1,24 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/Tensor.hpp>
+#include <armnn/Descriptors.hpp>
+
+#include <boost/optional.hpp>
+
+#include <arm_compute/core/Error.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClConvolution2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Convolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases);
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvolution2dFloatWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dFloatWorkload.cpp
new file mode 100644
index 0000000000..813808345e
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvolution2dFloatWorkload.cpp
@@ -0,0 +1,81 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClConvolution2dFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/ClLayerSupport.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+ClConvolution2dFloatWorkload::ClConvolution2dFloatWorkload(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : FloatWorkload<Convolution2dQueueDescriptor>(descriptor, info)
+ , m_ConvolutionLayer(memoryManager)
+{
+
+ // todo: check tensor shapes match.
+ const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
+
+ m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo, descriptor.m_DataLayout);
+
+ arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
+ m_Data.m_Parameters.m_StrideY,
+ m_Data.m_Parameters.m_PadLeft,
+ m_Data.m_Parameters.m_PadRight,
+ m_Data.m_Parameters.m_PadTop,
+ m_Data.m_Parameters.m_PadBottom,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), descriptor.m_DataLayout);
+ }
+
+ m_Data.ValidateInputsOutputs("ClConvolution2dFloat32Workload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_ConvolutionLayer.configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+
+ InitializeArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight);
+
+ if (m_BiasTensor)
+ {
+ InitializeArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias);
+ }
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_ConvolutionLayer.prepare();
+ FreeUnusedTensors();
+}
+
+void ClConvolution2dFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dFloat32Workload_Execute");
+
+ m_ConvolutionLayer.run();
+}
+
+void ClConvolution2dFloatWorkload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvolution2dFloatWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dFloatWorkload.hpp
new file mode 100644
index 0000000000..1f9710e1ea
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvolution2dFloatWorkload.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+class ClConvolution2dFloatWorkload : public FloatWorkload<Convolution2dQueueDescriptor>
+{
+public:
+ ClConvolution2dFloatWorkload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
+
+ std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClConvolution2dUint8Workload.cpp b/src/backends/cl/workloads/ClConvolution2dUint8Workload.cpp
new file mode 100644
index 0000000000..d9b9dfd833
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvolution2dUint8Workload.cpp
@@ -0,0 +1,81 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClConvolution2dUint8Workload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/ClLayerSupport.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : Uint8Workload<Convolution2dQueueDescriptor>(descriptor, info)
+ , m_ConvolutionLayer(memoryManager)
+{
+ // todo: check tensor shapes match
+ const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
+
+ m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo, descriptor.m_DataLayout);
+
+ arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
+ m_Data.m_Parameters.m_StrideY,
+ m_Data.m_Parameters.m_PadLeft,
+ m_Data.m_Parameters.m_PadRight,
+ m_Data.m_Parameters.m_PadTop,
+ m_Data.m_Parameters.m_PadBottom,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), descriptor.m_DataLayout);
+ }
+
+ m_Data.ValidateInputsOutputs("ClConvolution2dUint8Workload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_ConvolutionLayer.configure(&input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo);
+
+ InitializeArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight);
+
+ if (m_BiasTensor)
+ {
+ InitializeArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias);
+ }
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_ConvolutionLayer.prepare();
+ FreeUnusedTensors();
+}
+
+void ClConvolution2dUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dUint8Workload_Execute");
+
+ m_ConvolutionLayer.run();
+}
+
+void ClConvolution2dUint8Workload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClConvolution2dUint8Workload.hpp b/src/backends/cl/workloads/ClConvolution2dUint8Workload.hpp
new file mode 100644
index 0000000000..1720ec935c
--- /dev/null
+++ b/src/backends/cl/workloads/ClConvolution2dUint8Workload.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+class ClConvolution2dUint8Workload : public Uint8Workload<Convolution2dQueueDescriptor>
+{
+public:
+ ClConvolution2dUint8Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
+
+ std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.cpp
new file mode 100644
index 0000000000..5a036db922
--- /dev/null
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.cpp
@@ -0,0 +1,125 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
+#include "TypeUtils.hpp"
+
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout);
+
+ arm_compute::TensorInfo aclBiasesInfo;
+ arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr;
+
+ if (descriptor.m_BiasEnabled)
+ {
+ BOOST_ASSERT(biases.is_initialized());
+
+ aclBiasesInfo = BuildArmComputeTensorInfo(biases.get(), descriptor.m_DataLayout);
+ optionalAclBiasesInfo = &aclBiasesInfo;
+ }
+
+ const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor);
+ const unsigned int aclDepthMultiplier = weights.GetShape()[0];
+
+ return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo,
+ &aclWeightsInfo,
+ optionalAclBiasesInfo,
+ &aclOutputInfo,
+ aclPadStrideInfo,
+ aclDepthMultiplier);
+}
+
+template<armnn::DataType... dataTypes>
+ClDepthwiseConvolutionBaseWorkload<dataTypes...>::ClDepthwiseConvolutionBaseWorkload(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>(descriptor, info)
+{
+ auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
+
+ m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_KernelTensor, weightInfo);
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
+ }
+
+ arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
+ m_Data.m_Parameters.m_StrideY,
+ m_Data.m_Parameters.m_PadLeft,
+ m_Data.m_Parameters.m_PadRight,
+ m_Data.m_Parameters.m_PadTop,
+ m_Data.m_Parameters.m_PadBottom,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ std::string name = std::string("ClDepthwiseConvolution") +
+ GetDataTypeName(m_Data.m_Weight->GetTensorInfo().GetDataType()) + "Workload";
+ m_Data.ValidateInputsOutputs(name, 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ const unsigned int depthMultiplier = weightInfo.GetShape()[0];
+
+ //Check for optimisation opportunities.
+ bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3);
+ if (use3x3Optimisation)
+ {
+ m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
+ static_cast<arm_compute::CLDepthwiseConvolutionLayer3x3*>(m_DepthwiseConvolutionLayer.get())->configure(
+ &input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo,
+ depthMultiplier);
+ }
+ else
+ {
+ m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
+ static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_DepthwiseConvolutionLayer.get())->configure(
+ &input,
+ m_KernelTensor.get(),
+ m_BiasTensor.get(),
+ &output,
+ padStrideInfo,
+ depthMultiplier);
+ }
+
+ BOOST_ASSERT(m_DepthwiseConvolutionLayer);
+}
+
+template<armnn::DataType... dataTypes>
+void ClDepthwiseConvolutionBaseWorkload<dataTypes...>::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_KernelTensor);
+ FreeTensorIfUnused(m_BiasTensor);
+}
+
+// Generate known implementations for linker
+template class ClDepthwiseConvolutionBaseWorkload<DataType::Float16, DataType::Float32>;
+template class ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8>;
+
+} // namespace armnn
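The base workload above derives the depth multiplier from dimension 0 of the weights and switches to CLDepthwiseConvolutionLayer3x3 when the kernel is 3x3, so the weight layout implied here is [multiplier, channels, height, width]. A hedged sketch of a support query using the validate function declared for this workload (shapes and function name are illustrative assumptions):

// Illustrative sketch: validate a 3x3 depthwise convolution on CL, no bias.
#include <backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/Tensor.hpp>
#include <boost/optional.hpp>

bool SketchCanRunDepthwise3x3OnCl()
{
    using namespace armnn;

    const TensorInfo input(TensorShape({1, 16, 32, 32}), DataType::Float32);
    const TensorInfo output(TensorShape({1, 16, 32, 32}), DataType::Float32);
    const TensorInfo weights(TensorShape({1, 16, 3, 3}), DataType::Float32); // [M, C, H, W]

    DepthwiseConvolution2dDescriptor descriptor;
    descriptor.m_StrideX = 1;
    descriptor.m_StrideY = 1;
    descriptor.m_PadLeft = 1;
    descriptor.m_PadRight = 1;
    descriptor.m_PadTop = 1;
    descriptor.m_PadBottom = 1;
    descriptor.m_BiasEnabled = false;

    const boost::optional<TensorInfo> noBias;
    const arm_compute::Status status = ClDepthwiseConvolutionWorkloadValidate(input, output, descriptor,
                                                                              weights, noBias);
    return status.error_code() == arm_compute::ErrorCode::OK;
}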
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.hpp b/src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.hpp
new file mode 100644
index 0000000000..9d5cde30b6
--- /dev/null
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionBaseWorkload.hpp
@@ -0,0 +1,40 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <boost/optional.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const DepthwiseConvolution2dDescriptor& descriptor,
+ const TensorInfo& weights,
+ const boost::optional<TensorInfo>& biases);
+
+template<armnn::DataType... dataTypes>
+class ClDepthwiseConvolutionBaseWorkload : public TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>
+{
+public:
+ using TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>::m_Data;
+
+ ClDepthwiseConvolutionBaseWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+
+protected:
+ std::unique_ptr<arm_compute::IFunction> m_DepthwiseConvolutionLayer;
+
+ std::unique_ptr<arm_compute::CLTensor> m_KernelTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_BiasTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.cpp
new file mode 100644
index 0000000000..17ecd29307
--- /dev/null
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.cpp
@@ -0,0 +1,39 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClDepthwiseConvolutionFloatWorkload.hpp"
+
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClDepthwiseConvolutionFloatWorkload::ClDepthwiseConvolutionFloatWorkload(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : ClDepthwiseConvolutionBaseWorkload(descriptor, info)
+{
+ InitializeArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight);
+
+ if (m_BiasTensor)
+ {
+ InitializeArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias);
+ }
+
+ m_DepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
+}
+
+void ClDepthwiseConvolutionFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionFloatWorkload_Execute");
+ BOOST_ASSERT(m_DepthwiseConvolutionLayer);
+
+ m_DepthwiseConvolutionLayer->run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.hpp b/src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.hpp
new file mode 100644
index 0000000000..4f9d5f332e
--- /dev/null
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionFloatWorkload.hpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
+namespace armnn
+{
+
+class ClDepthwiseConvolutionFloatWorkload : public ClDepthwiseConvolutionBaseWorkload<DataType::Float16,
+ DataType::Float32>
+{
+public:
+ ClDepthwiseConvolutionFloatWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+ void Execute() const override;
+};
+
+} //namespace armnn
+
+
+
+
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.cpp
new file mode 100644
index 0000000000..22922e4df6
--- /dev/null
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.cpp
@@ -0,0 +1,40 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClDepthwiseConvolutionUint8Workload.hpp"
+
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClDepthwiseConvolutionUint8Workload::ClDepthwiseConvolutionUint8Workload(
+ const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : ClDepthwiseConvolutionBaseWorkload(descriptor, info)
+{
+ InitializeArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight);
+
+ if (m_BiasTensor)
+ {
+ InitializeArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias);
+ }
+
+ m_DepthwiseConvolutionLayer->prepare();
+ FreeUnusedTensors();
+}
+
+void ClDepthwiseConvolutionUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionUint8Workload_Execute");
+ BOOST_ASSERT(m_DepthwiseConvolutionLayer);
+
+ m_DepthwiseConvolutionLayer->run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.hpp b/src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.hpp
new file mode 100644
index 0000000000..b9f676de94
--- /dev/null
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionUint8Workload.hpp
@@ -0,0 +1,23 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClDepthwiseConvolutionBaseWorkload.hpp"
+
+namespace armnn
+{
+
+class ClDepthwiseConvolutionUint8Workload : public ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ ClDepthwiseConvolutionUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info);
+ void Execute() const override;
+};
+
+} //namespace armnn
+
+
diff --git a/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp b/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp
new file mode 100644
index 0000000000..a2d8534682
--- /dev/null
+++ b/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClDivisionFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClDivisionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::CLArithmeticDivision::validate(&aclInput1, &aclInput2, &aclOutput);
+}
+
+
+ClDivisionFloatWorkload::ClDivisionFloatWorkload(const DivisionQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<DivisionQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClDivisionFloatWorkload", 2, 1);
+
+ arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    // Configure the arithmetic division function.
+ m_ArithmeticDivision.configure(&input0, &input1, &output);
+}
+
+void ClDivisionFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClDivisionFloatWorkload_Execute");
+
+ // Executes the layer.
+ m_ArithmeticDivision.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp b/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp
new file mode 100644
index 0000000000..1aa7ec69f6
--- /dev/null
+++ b/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp
@@ -0,0 +1,32 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClDivisionWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
+
+class ClDivisionFloatWorkload : public FloatWorkload<DivisionQueueDescriptor>
+{
+public:
+    ClDivisionFloatWorkload(const DivisionQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ using FloatWorkload<DivisionQueueDescriptor>::FloatWorkload;
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLArithmeticDivision m_ArithmeticDivision;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.cpp b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
new file mode 100644
index 0000000000..0a60fc3b5c
--- /dev/null
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.cpp
@@ -0,0 +1,31 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClFloorFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClFloorFloatWorkload::ClFloorFloatWorkload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : FloatWorkload<FloorQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClFloorFloatWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input, &output);
+}
+
+void ClFloorFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClFloorFloatWorkload_Execute");
+ m_Layer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClFloorFloatWorkload.hpp b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
new file mode 100644
index 0000000000..513862a4d7
--- /dev/null
+++ b/src/backends/cl/workloads/ClFloorFloatWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+class ClFloorFloatWorkload : public FloatWorkload<FloorQueueDescriptor>
+{
+public:
+ ClFloorFloatWorkload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLFloor m_Layer;
+};
+
+} //namespace armnn
+
+
+
+
diff --git a/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
new file mode 100644
index 0000000000..b3a97f35f8
--- /dev/null
+++ b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp
@@ -0,0 +1,96 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClFullyConnectedWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/cl/ClLayerSupport.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output);
+ const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights);
+
+ arm_compute::TensorInfo aclBiases;
+ arm_compute::TensorInfo *optionalAclBiases = nullptr;
+ if (descriptor.m_BiasEnabled)
+ {
+ aclBiases = BuildArmComputeTensorInfo(biases);
+ optionalAclBiases = &aclBiases;
+ }
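+    // A null bias pointer tells the ACL validate function that no bias is present.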
+
+ const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo =
+ ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor);
+
+ return arm_compute::CLFullyConnectedLayer::validate(&aclInput,
+ &aclWeights,
+ optionalAclBiases,
+ &aclOutput,
+ fullyConnectedLayerInfo);
+}
+
+ClFullyConnectedWorkload::ClFullyConnectedWorkload(const FullyConnectedQueueDescriptor& descriptor,
+ const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info)
+ , m_FullyConnectedLayer(memoryManager)
+{
+ m_WeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo());
+
+ if (m_Data.m_Parameters.m_BiasEnabled)
+ {
+ m_BiasesTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo());
+ }
+
+ m_Data.ValidateInputsOutputs("ClFullyConnectedWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    // Configure the fully connected layer; only the weight-transpose flag needs to be set here.
+ arm_compute::FullyConnectedLayerInfo fc_info;
+ fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix;
+ m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info);
+
+ InitializeArmComputeClTensorData(*m_WeightsTensor, m_Data.m_Weight);
+
+ if (m_BiasesTensor)
+ {
+ InitializeArmComputeClTensorData(*m_BiasesTensor, m_Data.m_Bias);
+ }
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_FullyConnectedLayer.prepare();
+ FreeUnusedTensors();
+}
+
+void ClFullyConnectedWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClFullyConnectedWorkload_Execute");
+ m_FullyConnectedLayer.run();
+}
+
+void ClFullyConnectedWorkload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_WeightsTensor);
+ FreeTensorIfUnused(m_BiasesTensor);
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp b/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp
new file mode 100644
index 0000000000..0c9047235b
--- /dev/null
+++ b/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const TensorInfo& weights,
+ const TensorInfo& biases,
+ const FullyConnectedDescriptor& descriptor);
+
+class ClFullyConnectedWorkload : public armnn::BaseWorkload<armnn::FullyConnectedQueueDescriptor>
+{
+public:
+ ClFullyConnectedWorkload(const armnn::FullyConnectedQueueDescriptor& descriptor,
+ const armnn::WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+
+ using armnn::BaseWorkload<armnn::FullyConnectedQueueDescriptor>::m_Data;
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLFullyConnectedLayer m_FullyConnectedLayer;
+
+ std::unique_ptr<arm_compute::CLTensor> m_WeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_BiasesTensor;
+
+ void FreeUnusedTensors();
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
new file mode 100644
index 0000000000..edc13bcfea
--- /dev/null
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.cpp
@@ -0,0 +1,50 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClL2NormalizationFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const L2NormalizationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+
+ arm_compute::NormalizationLayerInfo normalizationInfo =
+ CreateAclNormalizationLayerInfoForL2Normalization(input);
+
+ return arm_compute::CLNormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo);
+}
+
+ClL2NormalizationFloatWorkload::ClL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClL2NormalizationFloatWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
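+    // Note: L2 normalisation is implemented via a generic CLNormalizationLayer configured by
+    // CreateAclNormalizationLayerInfoForL2Normalization rather than a dedicated ACL L2-normalise
+    // function; the comment on m_Layer in the header refers to this choice.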
+ m_Layer.configure(&input, &output, CreateAclNormalizationLayerInfoForL2Normalization(info.m_InputTensorInfos[0]));
+}
+
+void ClL2NormalizationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClL2NormalizationFloatWorkload_Execute");
+ m_Layer.run();
+}
+
+} //namespace armnn
+
+
+
diff --git a/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
new file mode 100644
index 0000000000..f7b7911f4c
--- /dev/null
+++ b/src/backends/cl/workloads/ClL2NormalizationFloatWorkload.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const L2NormalizationDescriptor& descriptor);
+
+class ClL2NormalizationFloatWorkload : public FloatWorkload<L2NormalizationQueueDescriptor>
+{
+public:
+ ClL2NormalizationFloatWorkload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ // Purposely not a CLL2Normalize function. See constructor.
+ mutable arm_compute::CLNormalizationLayer m_Layer;
+};
+
+} //namespace armnn
+
+
+
+
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
new file mode 100644
index 0000000000..352698ad1b
--- /dev/null
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -0,0 +1,391 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClLstmFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/cl/ClLayerSupport.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/runtime/CL/functions/CLLSTMLayer.h>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+ClLstmFloatWorkload::ClLstmFloatWorkload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info)
+ : FloatWorkload<LstmQueueDescriptor>(descriptor, info)
+{
+ arm_compute::LSTMParams<arm_compute::ICLTensor> lstm_param;
+
+ // Basic parameters
+ m_InputToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo());
+
+ m_InputToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo());
+
+ m_InputToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo());
+
+ m_RecurrentToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo());
+
+ m_RecurrentToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo());
+
+ m_RecurrentToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo());
+
+ m_ForgetGateBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo());
+
+ m_CellBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo());
+
+ m_OutputGateBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo());
+
+    // The optional parameters below follow the Android NN LSTM operation: the CIFG, projection and
+    // peephole tensors are only provided when the corresponding feature is enabled.
+ if (!m_Data.m_Parameters.m_CifgEnabled)
+ {
+ m_InputToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo());
+
+ m_RecurrentToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo());
+
+ m_CellToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ if (m_Data.m_CellToInputWeights != nullptr)
+ {
+ BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo());
+ }
+
+ m_InputGateBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo());
+
+ lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(),
+ m_RecurrentToInputWeightsTensor.get(),
+ m_Data.m_CellToInputWeights != nullptr ? m_CellToInputWeightsTensor.get() : nullptr,
+ m_InputGateBiasTensor.get());
+ }
+
+ if (m_Data.m_Parameters.m_ProjectionEnabled)
+ {
+ m_ProjectionWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo());
+
+ m_ProjectionBiasTensor = std::make_unique<arm_compute::CLTensor>();
+ if (m_Data.m_ProjectionBias != nullptr)
+ {
+ BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo());
+ }
+
+ lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(),
+ m_Data.m_ProjectionBias != nullptr ? m_ProjectionBiasTensor.get() : nullptr);
+ }
+
+ if (m_Data.m_Parameters.m_PeepholeEnabled)
+ {
+ m_CellToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo());
+
+ m_CellToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>();
+ BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo());
+
+ lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get());
+ }
+
+ const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ const arm_compute::ICLTensor& output_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ const arm_compute::ICLTensor& cell_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[2])->GetTensor();
+
+ arm_compute::ICLTensor& output_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[1])->GetTensor();
+ arm_compute::ICLTensor& cell_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[2])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[3])->GetTensor();
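+    // Outputs 1-3 are the output state, cell state and final output; the scratch buffer is not taken
+    // from the descriptor but is managed locally (see m_ScratchBuffer below).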
+
+ // Get the batch_size and the num_units from the cellStateIn dimensions
+ const TensorInfo& inputTensorInfo = info.m_InputTensorInfos[2];
+ const unsigned int batch_size = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[0]);
+ const unsigned int num_units = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[1]);
+
+ m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>();
+    if (m_Data.m_Parameters.m_CifgEnabled)
+    {
+        // 2D scratch buffer with dimensions [num_units * 3, batch_size] with CIFG
+        // (the input gate is coupled to the forget gate, so only three gates need scratch space).
+        armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 3 }, DataType::Float32);
+        BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1);
+    }
+    else
+    {
+        // 2D scratch buffer with dimensions [num_units * 4, batch_size] without CIFG
+        armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 4 }, DataType::Float32);
+        BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2);
+    }
+
+ float cell_threshold = m_Data.m_Parameters.m_ClippingThresCell;
+ float projection_threshold = m_Data.m_Parameters.m_ClippingThresProj;
+
+    // Map the activation function to an ACL ActivationLayerInfo. The encoding follows the Android NN
+    // LSTM operation: 0 = none, 1 = ReLU, 3 = ReLU6, 4 = TanH, 6 = Sigmoid.
+ arm_compute::ActivationLayerInfo activationLayerInfo;
+ if (m_Data.m_Parameters.m_ActivationFunc == 0)
+ {
+ // no activation, do nothing
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 1)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 3)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0);
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 4)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0);
+ }
+ else if (m_Data.m_Parameters.m_ActivationFunc == 6)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC);
+ }
+ else
+ {
+        throw armnn::Exception("Unsupported activation function");
+ }
+
+
+ m_LstmLayer.configure(&input, m_InputToForgetWeightsTensor.get(), m_InputToCellWeightsTensor.get(),
+ m_InputToOutputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(),
+ m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(),
+ m_ForgetGateBiasTensor.get(), m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(),
+ &output_state_in, &cell_state_in, m_ScratchBuffer.get(), &output_state_out,
+ &cell_state_out, &output, lstm_param, activationLayerInfo,
+ cell_threshold, projection_threshold);
+
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer);
+
+ InitializeArmComputeClTensorData(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights);
+ InitializeArmComputeClTensorData(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights);
+ InitializeArmComputeClTensorData(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights);
+ InitializeArmComputeClTensorData(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights);
+ InitializeArmComputeClTensorData(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights);
+ InitializeArmComputeClTensorData(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights);
+ InitializeArmComputeClTensorData(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias);
+ InitializeArmComputeClTensorData(*m_CellBiasTensor, m_Data.m_CellBias);
+ InitializeArmComputeClTensorData(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias);
+
+ if (!m_Data.m_Parameters.m_CifgEnabled)
+ {
+ InitializeArmComputeClTensorData(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights);
+ InitializeArmComputeClTensorData(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights);
+ if (m_Data.m_CellToInputWeights != nullptr)
+ {
+ InitializeArmComputeClTensorData(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights);
+ }
+ InitializeArmComputeClTensorData(*m_InputGateBiasTensor, m_Data.m_InputGateBias);
+ }
+
+ if (m_Data.m_Parameters.m_ProjectionEnabled)
+ {
+ InitializeArmComputeClTensorData(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights);
+ if (m_Data.m_ProjectionBias != nullptr)
+ {
+ InitializeArmComputeClTensorData(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias);
+ }
+ }
+
+ if (m_Data.m_Parameters.m_PeepholeEnabled)
+ {
+ InitializeArmComputeClTensorData(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights);
+ InitializeArmComputeClTensorData(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights);
+ }
+
+ // Force Compute Library to perform the necessary copying and reshaping, after which
+ // delete all the input tensors that will no longer be needed
+ m_LstmLayer.prepare();
+ FreeUnusedTensors();
+}
+
+void ClLstmFloatWorkload::Execute() const
+{
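+    ARMNN_SCOPED_PROFILING_EVENT_CL("ClLstmFloatWorkload_Execute");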
+ m_LstmLayer.run();
+}
+
+arm_compute::Status ClLstmFloatWorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor& descriptor,
+ const TensorInfo& inputToForgetWeights,
+ const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights,
+ const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights,
+ const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias,
+ const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights,
+ const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias,
+ const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias,
+ const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights)
+{
+ arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info;
+
+ // The inputs and the outputs
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn);
+ const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn);
+ const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer);
+ const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut);
+ const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+ // Basic parameters
+ const arm_compute::TensorInfo aclInputToForgetWeightsInfo = BuildArmComputeTensorInfo(inputToForgetWeights);
+ const arm_compute::TensorInfo aclInputToCellWeightsInfo = BuildArmComputeTensorInfo(inputToCellWeights);
+ const arm_compute::TensorInfo aclInputToOutputWeightsInfo = BuildArmComputeTensorInfo(inputToOutputWeights);
+ const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo
+ = BuildArmComputeTensorInfo(recurrentToForgetWeights);
+ const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo
+ = BuildArmComputeTensorInfo(recurrentToCellWeights);
+ const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo
+ = BuildArmComputeTensorInfo(recurrentToOutputWeights);
+ const arm_compute::TensorInfo aclForgetGateBiasInfo = BuildArmComputeTensorInfo(forgetGateBias);
+ const arm_compute::TensorInfo aclCellBiasInfo = BuildArmComputeTensorInfo(cellBias);
+ const arm_compute::TensorInfo aclOutputGateBiasInfo = BuildArmComputeTensorInfo(outputGateBias);
+
+ arm_compute::TensorInfo aclInputToInputWeightsInfo;
+ arm_compute::TensorInfo aclRecurrentToInputWeightsInfo;
+ arm_compute::TensorInfo aclCellToInputWeightsInfo;
+ arm_compute::TensorInfo aclInputGateBiasInfo;
+ arm_compute::TensorInfo aclProjectionWeightsInfo;
+ arm_compute::TensorInfo aclProjectionBiasInfo;
+ arm_compute::TensorInfo aclCellToForgetWeightsInfo;
+ arm_compute::TensorInfo aclCellToOutputWeightsInfo;
+
+ if (!descriptor.m_CifgEnabled)
+ {
+ armnn::TensorInfo inputToInputWInfo = *inputToInputWeights;
+ aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(inputToInputWInfo);
+ armnn::TensorInfo recurrentToInputWInfo = *recurrentToInputWeights;
+ aclRecurrentToInputWeightsInfo = BuildArmComputeTensorInfo(recurrentToInputWInfo);
+
+ if (cellToInputWeights != nullptr)
+ {
+ armnn::TensorInfo cellToInputWInfo = *cellToInputWeights;
+ aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(cellToInputWInfo);
+ }
+ armnn::TensorInfo inputGateBiasInfo = *inputGateBias;
+ aclInputGateBiasInfo = BuildArmComputeTensorInfo(inputGateBiasInfo);
+ lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo, &aclRecurrentToInputWeightsInfo,
+ cellToInputWeights != nullptr ? &aclCellToInputWeightsInfo: nullptr,
+ &aclInputGateBiasInfo);
+ }
+
+ if (descriptor.m_ProjectionEnabled)
+ {
+ const armnn::TensorInfo& projectionWInfo = *projectionWeights;
+ aclProjectionWeightsInfo = BuildArmComputeTensorInfo(projectionWInfo);
+
+ if (projectionBias != nullptr)
+ {
+ const armnn::TensorInfo& projectionBiasInfo = *projectionBias;
+ aclProjectionBiasInfo = BuildArmComputeTensorInfo(projectionBiasInfo);
+ }
+ lstm_params_info.set_projection_params(&aclProjectionWeightsInfo,
+ projectionBias != nullptr ? &aclProjectionBiasInfo: nullptr);
+ }
+
+ if (descriptor.m_PeepholeEnabled)
+ {
+ const armnn::TensorInfo& cellToForgetWInfo = *cellToForgetWeights;
+ aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(cellToForgetWInfo);
+ const armnn::TensorInfo& cellToOutputWInfo = *cellToOutputWeights;
+ aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(cellToOutputWInfo);
+ lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo);
+ }
+
+ float cell_threshold = descriptor.m_ClippingThresCell;
+ float projection_threshold = descriptor.m_ClippingThresProj;
+
+    // Map the activation function to an ACL ActivationLayerInfo. The encoding follows the Android NN
+    // LSTM operation: 0 = none, 1 = ReLU, 3 = ReLU6, 4 = TanH, 6 = Sigmoid.
+ arm_compute::ActivationLayerInfo activationLayerInfo;
+ if (descriptor.m_ActivationFunc == 0)
+ {
+ // no activation, do nothing
+ }
+ else if (descriptor.m_ActivationFunc == 1)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
+ }
+ else if (descriptor.m_ActivationFunc == 3)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0);
+ }
+ else if (descriptor.m_ActivationFunc == 4)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0);
+ }
+ else if (descriptor.m_ActivationFunc == 6)
+ {
+ activationLayerInfo = arm_compute::ActivationLayerInfo(
+ arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC);
+ }
+ else
+ {
+        throw armnn::Exception("Unsupported activation function");
+ }
+
+ return arm_compute::CLLSTMLayer::validate(&aclInputInfo, &aclInputToForgetWeightsInfo,
+ &aclInputToCellWeightsInfo,
+ &aclInputToOutputWeightsInfo,
+ &aclRecurrentToForgetWeightsInfo,
+ &aclRecurrentToCellWeightsInfo,
+ &aclRecurrentToOutputWeightsInfo,
+ &aclForgetGateBiasInfo,
+ &aclCellBiasInfo,
+ &aclOutputGateBiasInfo,
+ &aclOutputStateInInfo, &aclCellStateInInfo,
+ &aclScratchBufferInfo, &aclOutputStateOutInfo,
+ &aclCellStateOutInfo, &aclOutputInfo,
+ lstm_params_info, activationLayerInfo,
+ cell_threshold, projection_threshold);
+}
+
+void ClLstmFloatWorkload::FreeUnusedTensors()
+{
+ FreeTensorIfUnused(m_InputToInputWeightsTensor);
+ FreeTensorIfUnused(m_InputToForgetWeightsTensor);
+ FreeTensorIfUnused(m_InputToCellWeightsTensor);
+ FreeTensorIfUnused(m_InputToOutputWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToInputWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToCellWeightsTensor);
+ FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor);
+ FreeTensorIfUnused(m_CellToInputWeightsTensor);
+ FreeTensorIfUnused(m_CellToForgetWeightsTensor);
+ FreeTensorIfUnused(m_CellToOutputWeightsTensor);
+ FreeTensorIfUnused(m_InputGateBiasTensor);
+ FreeTensorIfUnused(m_ForgetGateBiasTensor);
+ FreeTensorIfUnused(m_CellBiasTensor);
+ FreeTensorIfUnused(m_OutputGateBiasTensor);
+ FreeTensorIfUnused(m_ProjectionWeightsTensor);
+ FreeTensorIfUnused(m_ProjectionBiasTensor);
+ FreeTensorIfUnused(m_ScratchBuffer);
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.hpp b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
new file mode 100644
index 0000000000..352d774a99
--- /dev/null
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.hpp
@@ -0,0 +1,68 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <backends/WorkloadData.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+class ClLstmFloatWorkload : public FloatWorkload<LstmQueueDescriptor>
+{
+public:
+ ClLstmFloatWorkload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLLSTMLayer m_LstmLayer;
+
+ std::unique_ptr<arm_compute::CLTensor> m_InputToInputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputToForgetWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputToCellWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputToOutputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToInputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToForgetWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToCellWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_RecurrentToOutputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellToInputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellToForgetWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellToOutputWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_InputGateBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_ForgetGateBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_CellBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_OutputGateBiasTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_ProjectionWeightsTensor;
+ std::unique_ptr<arm_compute::CLTensor> m_ProjectionBiasTensor;
+
+ std::unique_ptr<arm_compute::CLTensor> m_ScratchBuffer;
+
+ void FreeUnusedTensors();
+};
+
+arm_compute::Status ClLstmFloatWorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn,
+ const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer,
+ const TensorInfo& outputStateOut, const TensorInfo& cellStateOut,
+ const TensorInfo& output, const LstmDescriptor &descriptor,
+ const TensorInfo& inputToForgetWeights,
+ const TensorInfo& inputToCellWeights,
+ const TensorInfo& inputToOutputWeights,
+ const TensorInfo& recurrentToForgetWeights,
+ const TensorInfo& recurrentToCellWeights,
+ const TensorInfo& recurrentToOutputWeights,
+ const TensorInfo& forgetGateBias, const TensorInfo& cellBias,
+ const TensorInfo& outputGateBias,
+ const TensorInfo* inputToInputWeights,
+ const TensorInfo* recurrentToInputWeights,
+ const TensorInfo* cellToInputWeights,
+ const TensorInfo* inputGateBias,
+ const TensorInfo* projectionWeights,
+ const TensorInfo* projectionBias,
+ const TensorInfo* cellToForgetWeights,
+ const TensorInfo* cellToOutputWeights);
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClMergerFloatWorkload.cpp b/src/backends/cl/workloads/ClMergerFloatWorkload.cpp
new file mode 100644
index 0000000000..151f1e0ee7
--- /dev/null
+++ b/src/backends/cl/workloads/ClMergerFloatWorkload.cpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClMergerFloatWorkload.hpp"
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+void ClMergerFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerFloatWorkload_Execute");
+ ClBaseMergerWorkload::Execute();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClMergerFloatWorkload.hpp b/src/backends/cl/workloads/ClMergerFloatWorkload.hpp
new file mode 100644
index 0000000000..9782f7a8f3
--- /dev/null
+++ b/src/backends/cl/workloads/ClMergerFloatWorkload.hpp
@@ -0,0 +1,22 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseMergerWorkload.hpp"
+
+namespace armnn
+{
+
+class ClMergerFloatWorkload : public ClBaseMergerWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ using ClBaseMergerWorkload<DataType::Float16, DataType::Float32>::ClBaseMergerWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
+
+
diff --git a/src/backends/cl/workloads/ClMergerUint8Workload.cpp b/src/backends/cl/workloads/ClMergerUint8Workload.cpp
new file mode 100644
index 0000000000..9d1060d857
--- /dev/null
+++ b/src/backends/cl/workloads/ClMergerUint8Workload.cpp
@@ -0,0 +1,19 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClMergerUint8Workload.hpp"
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+void ClMergerUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerUint8Workload_Execute");
+ ClBaseMergerWorkload<DataType::QuantisedAsymm8>::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClMergerUint8Workload.hpp b/src/backends/cl/workloads/ClMergerUint8Workload.hpp
new file mode 100644
index 0000000000..cbfc19a0f2
--- /dev/null
+++ b/src/backends/cl/workloads/ClMergerUint8Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseMergerWorkload.hpp"
+
+namespace armnn
+{
+
+class ClMergerUint8Workload : public ClBaseMergerWorkload<armnn::DataType::QuantisedAsymm8>
+{
+public:
+ using ClBaseMergerWorkload<armnn::DataType::QuantisedAsymm8>::ClBaseMergerWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClMultiplicationFloatWorkload.cpp b/src/backends/cl/workloads/ClMultiplicationFloatWorkload.cpp
new file mode 100644
index 0000000000..d53e149129
--- /dev/null
+++ b/src/backends/cl/workloads/ClMultiplicationFloatWorkload.cpp
@@ -0,0 +1,60 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClMultiplicationFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output)
+{
+ const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it,
+ // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be
+ // ignored for F32 tensors.
+ return arm_compute::CLPixelWiseMultiplication::validate(&aclInput1,
+ &aclInput2,
+ &aclOutput,
+ 1.0f,
+ arm_compute::ConvertPolicy::SATURATE,
+ arm_compute::RoundingPolicy::TO_ZERO);
+}
+
+
+ClMultiplicationFloatWorkload::ClMultiplicationFloatWorkload(const MultiplicationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<MultiplicationQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClMultiplicationFloatWorkload", 2, 1);
+
+ arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    // Configure the pixel-wise multiplication function.
+ m_PixelWiseMultiplication.configure(&input0,
+ &input1,
+ &output,
+ 1.0f,
+ arm_compute::ConvertPolicy::SATURATE,
+ arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+}
+
+void ClMultiplicationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClMultiplicationFloatWorkload_Execute");
+
+ // Executes the layer.
+ m_PixelWiseMultiplication.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClMultiplicationFloatWorkload.hpp b/src/backends/cl/workloads/ClMultiplicationFloatWorkload.hpp
new file mode 100644
index 0000000000..a793ac64df
--- /dev/null
+++ b/src/backends/cl/workloads/ClMultiplicationFloatWorkload.hpp
@@ -0,0 +1,34 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output);
+
+class ClMultiplicationFloatWorkload : public FloatWorkload<MultiplicationQueueDescriptor>
+{
+public:
+ ClMultiplicationFloatWorkload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ using FloatWorkload<MultiplicationQueueDescriptor>::FloatWorkload;
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLPixelWiseMultiplication m_PixelWiseMultiplication;
+};
+
+} //namespace armnn
+
+
+
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
new file mode 100644
index 0000000000..969c9bb08b
--- /dev/null
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.cpp
@@ -0,0 +1,51 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClNormalizationFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/cl/ClLayerSupport.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include "ClWorkloadUtils.hpp"
+
+using namespace armnn::armcomputetensorutils;
+
+namespace armnn
+{
+
+arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout);
+
+ arm_compute::NormalizationLayerInfo layerInfo = BuildArmComputeNormalizationLayerInfo(descriptor);
+
+ return arm_compute::CLNormalizationLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo);
+}
+
+ClNormalizationFloatWorkload::ClNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClNormalizationFloatWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ arm_compute::NormalizationLayerInfo normalizationInfo = BuildArmComputeNormalizationLayerInfo(m_Data.m_Parameters);
+
+ m_NormalizationLayer.configure(&input, &output, normalizationInfo);
+}
+
+void ClNormalizationFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClNormalizationFloatWorkload_Execute");
+ m_NormalizationLayer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
new file mode 100644
index 0000000000..f30be91aaa
--- /dev/null
+++ b/src/backends/cl/workloads/ClNormalizationFloatWorkload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const NormalizationDescriptor& descriptor);
+
+class ClNormalizationFloatWorkload : public FloatWorkload<NormalizationQueueDescriptor>
+{
+public:
+ ClNormalizationFloatWorkload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLNormalizationLayer m_NormalizationLayer;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClPadWorkload.cpp b/src/backends/cl/workloads/ClPadWorkload.cpp
new file mode 100644
index 0000000000..45dc5e8be7
--- /dev/null
+++ b/src/backends/cl/workloads/ClPadWorkload.cpp
@@ -0,0 +1,63 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClPadWorkload.hpp"
+
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <arm_compute/core/Types.h>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+template <armnn::DataType... T>
+ClPadWorkload<T...>::ClPadWorkload(const PadQueueDescriptor& descriptor, const WorkloadInfo& info)
+: TypedWorkload<PadQueueDescriptor, T...>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("ClPadWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+ arm_compute::PaddingList padList = static_cast<arm_compute::PaddingList>(descriptor.m_Parameters.m_PadList);
+
+ m_Layer.configure(&input, &output, padList);
+}
+
+template <armnn::DataType... T>
+void ClPadWorkload<T...>::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClPadWorkload_Execute");
+ m_Layer.run();
+}
+
+bool ClPadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const PadDescriptor& descriptor,
+ std::string* reasonIfUnsupported)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+ arm_compute::PaddingList padList = static_cast<arm_compute::PaddingList>(descriptor.m_PadList);
+
+ const arm_compute::Status aclStatus = arm_compute::CLPadLayer::validate(&aclInputInfo,
+ &aclOutputInfo,
+ padList);
+
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = aclStatus.error_description();
+ }
+
+ return supported;
+}
+
+} // namespace armnn
+
+template class armnn::ClPadWorkload<armnn::DataType::Float16, armnn::DataType::Float32>;
+template class armnn::ClPadWorkload<armnn::DataType::QuantisedAsymm8>;
diff --git a/src/backends/cl/workloads/ClPadWorkload.hpp b/src/backends/cl/workloads/ClPadWorkload.hpp
new file mode 100644
index 0000000000..a7ad6670a7
--- /dev/null
+++ b/src/backends/cl/workloads/ClPadWorkload.hpp
@@ -0,0 +1,32 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/WorkloadData.hpp>
+#include <backends/Workload.hpp>
+#include <arm_compute/runtime/CL/functions/CLPadLayer.h>
+
+namespace armnn {
+
+template <armnn::DataType... dataTypes>
+class ClPadWorkload : public TypedWorkload<PadQueueDescriptor, dataTypes...>
+{
+public:
+ ClPadWorkload(const PadQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLPadLayer m_Layer;
+};
+
+bool ClPadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const PadDescriptor& descriptor,
+ std::string* reasonIfUnsupported);
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClPermuteWorkload.cpp b/src/backends/cl/workloads/ClPermuteWorkload.cpp
new file mode 100644
index 0000000000..079772dbaf
--- /dev/null
+++ b/src/backends/cl/workloads/ClPermuteWorkload.cpp
@@ -0,0 +1,56 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClPermuteWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/core/Error.h>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descriptor)
+{
+ const armnn::PermutationVector& perm = descriptor.m_DimMappings;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!perm.IsEqual({ 0U, 3U, 1U, 2U })
+ && !perm.IsEqual({ 0U, 2U, 3U, 1U })
+ && !perm.IsEqual({ 3U, 2U, 0U, 1U }),
+ "Only [0, 3, 1, 2], [0, 2, 3, 1] and [3, 2, 0, 1] permutations are supported");
+
+ return arm_compute::Status{};
+}
+
+template <armnn::DataType... DataTypes>
+ClPermuteWorkload<DataTypes...>::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : TypedWorkload<PermuteQueueDescriptor, DataTypes...>(descriptor, info)
+{
+ using armcomputetensorutils::BuildArmComputePermutationVector;
+
+ m_Data.ValidateInputsOutputs(GetName(), 1, 1);
+
+ const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
+
+    // Configure the permute function; the layer is run in Execute().
+ m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings));
+}
+
+template <armnn::DataType... DataTypes>
+void ClPermuteWorkload<DataTypes...>::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL(GetName() + "_Execute");
+ m_PermuteFunction.run();
+}
+
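+// Generate known implementations for the linker.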
+template class ClPermuteWorkload<DataType::Float16, DataType::Float32>;
+template class ClPermuteWorkload<DataType::QuantisedAsymm8>;
+
+} // namespace armnn
diff --git a/src/backends/cl/workloads/ClPermuteWorkload.hpp b/src/backends/cl/workloads/ClPermuteWorkload.hpp
new file mode 100644
index 0000000000..8ff5707ad6
--- /dev/null
+++ b/src/backends/cl/workloads/ClPermuteWorkload.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+#include <backends/WorkloadData.hpp>
+
+#include <armnn/TypesUtils.hpp>
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+
+#include <string>
+
+namespace armnn
+{
+
+arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descriptor);
+
+template<armnn::DataType... DataTypes>
+class ClPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataTypes...>
+{
+public:
+ static const std::string& GetName()
+ {
+ static const std::string name = std::string("ClPermuteWorkload");
+ return name;
+ }
+
+ ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ using TypedWorkload<PermuteQueueDescriptor, DataTypes...>::m_Data;
+ mutable arm_compute::CLPermute m_PermuteFunction;
+};
+
+using ClPermuteFloatWorkload = ClPermuteWorkload<DataType::Float16, DataType::Float32>;
+using ClPermuteUint8Workload = ClPermuteWorkload<DataType::QuantisedAsymm8>;
+
+} // namespace armnn
diff --git a/src/backends/cl/workloads/ClPooling2dBaseWorkload.cpp b/src/backends/cl/workloads/ClPooling2dBaseWorkload.cpp
new file mode 100644
index 0000000000..98911856fe
--- /dev/null
+++ b/src/backends/cl/workloads/ClPooling2dBaseWorkload.cpp
@@ -0,0 +1,47 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClPooling2dBaseWorkload.hpp"
+#include <backends/cl/ClLayerSupport.hpp>
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor)
+{
+ const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+ arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(descriptor);
+
+ return arm_compute::CLPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo);
+}
+
+template <armnn::DataType... dataTypes>
+ClPooling2dBaseWorkload<dataTypes...>::ClPooling2dBaseWorkload(
+ const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name)
+ : TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs(name, 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters);
+
+    // Configure the pooling layer; it is run by the derived workloads' Execute().
+ m_PoolingLayer.configure(&input, &output, layerInfo);
+}
+
+template class ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>;
+template class ClPooling2dBaseWorkload<DataType::QuantisedAsymm8>;
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClPooling2dBaseWorkload.hpp b/src/backends/cl/workloads/ClPooling2dBaseWorkload.hpp
new file mode 100644
index 0000000000..8f9db08ddc
--- /dev/null
+++ b/src/backends/cl/workloads/ClPooling2dBaseWorkload.hpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output,
+ const Pooling2dDescriptor& descriptor);
+
+// Base class template providing an implementation of the Pooling2d layer common to all data types.
+template <armnn::DataType... dataTypes>
+class ClPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>
+{
+public:
+ using TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>::m_Data;
+
+ ClPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info,
+ const std::string& name);
+
+protected:
+ mutable arm_compute::CLPoolingLayer m_PoolingLayer;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClPooling2dFloatWorkload.cpp b/src/backends/cl/workloads/ClPooling2dFloatWorkload.cpp
new file mode 100644
index 0000000000..dc9d17f0ae
--- /dev/null
+++ b/src/backends/cl/workloads/ClPooling2dFloatWorkload.cpp
@@ -0,0 +1,26 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClPooling2dFloatWorkload.hpp"
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClPooling2dFloatWorkload::ClPooling2dFloatWorkload(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>(descriptor, info, "ClPooling2dFloatWorkload")
+{
+}
+
+void ClPooling2dFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dFloatWorkload_Execute");
+ m_PoolingLayer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClPooling2dFloatWorkload.hpp b/src/backends/cl/workloads/ClPooling2dFloatWorkload.hpp
new file mode 100644
index 0000000000..ba9294c40f
--- /dev/null
+++ b/src/backends/cl/workloads/ClPooling2dFloatWorkload.hpp
@@ -0,0 +1,22 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include "ClPooling2dBaseWorkload.hpp"
+
+namespace armnn
+{
+class ClPooling2dFloatWorkload : public ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ ClPooling2dFloatWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClPooling2dUint8Workload.cpp b/src/backends/cl/workloads/ClPooling2dUint8Workload.cpp
new file mode 100644
index 0000000000..0b4b15f806
--- /dev/null
+++ b/src/backends/cl/workloads/ClPooling2dUint8Workload.cpp
@@ -0,0 +1,27 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClPooling2dUint8Workload.hpp"
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClPooling2dUint8Workload::ClPooling2dUint8Workload(const Pooling2dQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : ClPooling2dBaseWorkload<DataType::QuantisedAsymm8>(descriptor, info, "ClPooling2dUint8Workload")
+{
+}
+
+void ClPooling2dUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dUint8Workload_Execute");
+ m_PoolingLayer.run();
+}
+
+} //namespace armnn
+
+
diff --git a/src/backends/cl/workloads/ClPooling2dUint8Workload.hpp b/src/backends/cl/workloads/ClPooling2dUint8Workload.hpp
new file mode 100644
index 0000000000..b07f955343
--- /dev/null
+++ b/src/backends/cl/workloads/ClPooling2dUint8Workload.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include "ClPooling2dBaseWorkload.hpp"
+
+namespace armnn
+{
+
+class ClPooling2dUint8Workload : public ClPooling2dBaseWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ ClPooling2dUint8Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+};
+
+} //namespace armnn
+
+
diff --git a/src/backends/cl/workloads/ClReshapeFloatWorkload.cpp b/src/backends/cl/workloads/ClReshapeFloatWorkload.cpp
new file mode 100644
index 0000000000..4da3bbd703
--- /dev/null
+++ b/src/backends/cl/workloads/ClReshapeFloatWorkload.cpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClReshapeFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClReshapeFloatWorkload::ClReshapeFloatWorkload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : FloatWorkload<ReshapeQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClReshapeFloatWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_Layer.configure(&input, &output);
+}
+
+void ClReshapeFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeFloatWorkload_Execute");
+ m_Layer.run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClReshapeFloatWorkload.hpp b/src/backends/cl/workloads/ClReshapeFloatWorkload.hpp
new file mode 100644
index 0000000000..e5fc20ec8b
--- /dev/null
+++ b/src/backends/cl/workloads/ClReshapeFloatWorkload.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+class ClReshapeFloatWorkload : public FloatWorkload<ReshapeQueueDescriptor>
+{
+public:
+ ClReshapeFloatWorkload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLReshapeLayer m_Layer;
+};
+
+} //namespace armnn
+
+
diff --git a/src/backends/cl/workloads/ClReshapeUint8Workload.cpp b/src/backends/cl/workloads/ClReshapeUint8Workload.cpp
new file mode 100644
index 0000000000..8fbee151fc
--- /dev/null
+++ b/src/backends/cl/workloads/ClReshapeUint8Workload.cpp
@@ -0,0 +1,31 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClReshapeUint8Workload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+ClReshapeUint8Workload::ClReshapeUint8Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : Uint8Workload<ReshapeQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClReshapeUint8Workload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ m_Layer.configure(&input, &output);
+}
+
+void ClReshapeUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeUint8Workload_Execute");
+
+ m_Layer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClReshapeUint8Workload.hpp b/src/backends/cl/workloads/ClReshapeUint8Workload.hpp
new file mode 100644
index 0000000000..654437a4c1
--- /dev/null
+++ b/src/backends/cl/workloads/ClReshapeUint8Workload.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+// Reshape
+class ClReshapeUint8Workload : public Uint8Workload<ReshapeQueueDescriptor>
+{
+public:
+    ClReshapeUint8Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLReshapeLayer m_Layer;
+};
+
+} //namespace armnn
+
+
diff --git a/src/backends/cl/workloads/ClResizeBilinearFloatWorkload.cpp b/src/backends/cl/workloads/ClResizeBilinearFloatWorkload.cpp
new file mode 100644
index 0000000000..499466e959
--- /dev/null
+++ b/src/backends/cl/workloads/ClResizeBilinearFloatWorkload.cpp
@@ -0,0 +1,38 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClResizeBilinearFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/cl/ClLayerSupport.hpp>
+#include <backends/aclCommon/ArmComputeUtils.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClResizeBilinearFloatWorkload::ClResizeBilinearFloatWorkload(const ResizeBilinearQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : FloatWorkload<ResizeBilinearQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("ClResizeBilinearFloatWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ m_ResizeBilinearLayer.configure(&input, &output, arm_compute::InterpolationPolicy::BILINEAR,
+ arm_compute::BorderMode::REPLICATE, arm_compute::PixelValue(0.f),
+ arm_compute::SamplingPolicy::TOP_LEFT);
+}
+
+void ClResizeBilinearFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClResizeBilinearFloatWorkload_Execute");
+ m_ResizeBilinearLayer.run();
+}
+
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClResizeBilinearFloatWorkload.hpp b/src/backends/cl/workloads/ClResizeBilinearFloatWorkload.hpp
new file mode 100644
index 0000000000..f29f416907
--- /dev/null
+++ b/src/backends/cl/workloads/ClResizeBilinearFloatWorkload.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+class ClResizeBilinearFloatWorkload : public FloatWorkload<ResizeBilinearQueueDescriptor>
+{
+public:
+ ClResizeBilinearFloatWorkload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLScale m_ResizeBilinearLayer;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClSoftmaxBaseWorkload.cpp b/src/backends/cl/workloads/ClSoftmaxBaseWorkload.cpp
new file mode 100644
index 0000000000..eb05a19670
--- /dev/null
+++ b/src/backends/cl/workloads/ClSoftmaxBaseWorkload.cpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClSoftmaxBaseWorkload.hpp"
+
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/runtime/CL/functions/CLSoftmaxLayer.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output)
+{
+ // NOTE: We report 4D Softmax as unsupported until full support is added to ACL
+ if(input.GetShape().GetNumDimensions() >= 4u)
+ {
+ return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported");
+ }
+
+ const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+ const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+ return arm_compute::CLSoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo);
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClSoftmaxBaseWorkload.hpp b/src/backends/cl/workloads/ClSoftmaxBaseWorkload.hpp
new file mode 100644
index 0000000000..b800056cdf
--- /dev/null
+++ b/src/backends/cl/workloads/ClSoftmaxBaseWorkload.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/Tensor.hpp>
+#include <arm_compute/core/Error.h>
+
+namespace armnn
+{
+
+arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input,
+ const TensorInfo& output);
+
+} // namespace armnn
diff --git a/src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp b/src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp
new file mode 100644
index 0000000000..606005659f
--- /dev/null
+++ b/src/backends/cl/workloads/ClSoftmaxFloatWorkload.cpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClSoftmaxFloatWorkload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClSoftmaxFloatWorkload::ClSoftmaxFloatWorkload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info)
+ , m_SoftmaxLayer(memoryManager)
+{
+ m_Data.ValidateInputsOutputs("ClSoftmaxFloatWorkload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ m_SoftmaxLayer.configure(&input, &output, m_Data.m_Parameters.m_Beta);
+}
+
+void ClSoftmaxFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxFloatWorkload_Execute");
+ m_SoftmaxLayer.run();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClSoftmaxFloatWorkload.hpp b/src/backends/cl/workloads/ClSoftmaxFloatWorkload.hpp
new file mode 100644
index 0000000000..b400b3c7ea
--- /dev/null
+++ b/src/backends/cl/workloads/ClSoftmaxFloatWorkload.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+class ClSoftmaxFloatWorkload : public FloatWorkload<SoftmaxQueueDescriptor>
+{
+public:
+ ClSoftmaxFloatWorkload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLSoftmaxLayer m_SoftmaxLayer;
+};
+
+} //namespace armnn
+
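ClSoftmaxFloatWorkload (and the uint8 variant below) takes a shared arm_compute::MemoryManagerOnDemand so ACL can pool the layer's internal buffers across workloads. A minimal sketch of constructing one with the blob lifetime manager and pool manager ACL provides; in this backend the real instance is owned by the workload factory rather than built per layer:

    #include <arm_compute/runtime/BlobLifetimeManager.h>
    #include <arm_compute/runtime/MemoryManagerOnDemand.h>
    #include <arm_compute/runtime/PoolManager.h>
    #include <memory>

    std::shared_ptr<arm_compute::MemoryManagerOnDemand> CreateMemoryManagerSketch()
    {
        auto lifetimeManager = std::make_shared<arm_compute::BlobLifetimeManager>();
        auto poolManager     = std::make_shared<arm_compute::PoolManager>();
        return std::make_shared<arm_compute::MemoryManagerOnDemand>(lifetimeManager, poolManager);
    }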
diff --git a/src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp b/src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp
new file mode 100644
index 0000000000..7e0589e89f
--- /dev/null
+++ b/src/backends/cl/workloads/ClSoftmaxUint8Workload.cpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClSoftmaxUint8Workload.hpp"
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+ClSoftmaxUint8Workload::ClSoftmaxUint8Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager)
+ : Uint8Workload<SoftmaxQueueDescriptor>(descriptor, info)
+ , m_SoftmaxLayer(memoryManager)
+{
+ m_Data.ValidateInputsOutputs("ClSoftmaxUint8Workload", 1, 1);
+
+ arm_compute::ICLTensor& input = static_cast<ClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<ClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+ const auto outputQuantization = output.info()->quantization_info();
+
+ if ((outputQuantization.scale != (1.0f / 256.0f)) || (outputQuantization.offset != 0))
+ {
+ throw InvalidArgumentException(
+ "Invalid quantization for output. Only scale = 1.0f / 256.0f and offset = 0 supported");
+ }
+
+ m_SoftmaxLayer.configure(&input, &output, descriptor.m_Parameters.m_Beta);
+}
+
+void ClSoftmaxUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxUint8Workload_Execute");
+
+ m_SoftmaxLayer.run();
+}
+
+} //namespace armnn
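The constructor above rejects any quantised output whose quantization parameters differ from softmax's fixed output range. A minimal sketch of a TensorInfo that satisfies that check (the shape argument is illustrative):

    #include <armnn/Tensor.hpp>

    armnn::TensorInfo MakeSoftmaxOutputInfoSketch(const armnn::TensorShape& shape)
    {
        // Softmax produces values in [0, 1), so a QAsymm8 output must use scale 1/256 and offset 0.
        armnn::TensorInfo outputInfo(shape, armnn::DataType::QuantisedAsymm8);
        outputInfo.SetQuantizationScale(1.0f / 256.0f);
        outputInfo.SetQuantizationOffset(0);
        return outputInfo;
    }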
diff --git a/src/backends/cl/workloads/ClSoftmaxUint8Workload.hpp b/src/backends/cl/workloads/ClSoftmaxUint8Workload.hpp
new file mode 100644
index 0000000000..4786faf60b
--- /dev/null
+++ b/src/backends/cl/workloads/ClSoftmaxUint8Workload.hpp
@@ -0,0 +1,31 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+
+#include <memory>
+
+namespace armnn
+{
+// Softmax
+class ClSoftmaxUint8Workload : public Uint8Workload<SoftmaxQueueDescriptor>
+{
+public:
+ ClSoftmaxUint8Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info,
+ std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager);
+
+    void Execute() const override;
+
+private:
+    mutable arm_compute::CLSoftmaxLayer m_SoftmaxLayer;
+};
+
+} //namespace armnn
+
diff --git a/src/backends/cl/workloads/ClSplitterFloatWorkload.cpp b/src/backends/cl/workloads/ClSplitterFloatWorkload.cpp
new file mode 100644
index 0000000000..5fd634bdb6
--- /dev/null
+++ b/src/backends/cl/workloads/ClSplitterFloatWorkload.cpp
@@ -0,0 +1,19 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClSplitterFloatWorkload.hpp"
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+void ClSplitterFloatWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterFloatWorkload_Execute");
+ ClBaseSplitterWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClSplitterFloatWorkload.hpp b/src/backends/cl/workloads/ClSplitterFloatWorkload.hpp
new file mode 100644
index 0000000000..a0b5846f8e
--- /dev/null
+++ b/src/backends/cl/workloads/ClSplitterFloatWorkload.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseSplitterWorkload.hpp"
+
+namespace armnn
+{
+
+class ClSplitterFloatWorkload : public ClBaseSplitterWorkload<DataType::Float16, DataType::Float32>
+{
+public:
+ using ClBaseSplitterWorkload<DataType::Float16, DataType::Float32>::ClBaseSplitterWorkload;
+ virtual void Execute() const override;
+};
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClSplitterUint8Workload.cpp b/src/backends/cl/workloads/ClSplitterUint8Workload.cpp
new file mode 100644
index 0000000000..50a251ada7
--- /dev/null
+++ b/src/backends/cl/workloads/ClSplitterUint8Workload.cpp
@@ -0,0 +1,19 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClSplitterUint8Workload.hpp"
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+void ClSplitterUint8Workload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterUint8Workload_Execute");
+ ClBaseSplitterWorkload::Execute();
+}
+
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClSplitterUint8Workload.hpp b/src/backends/cl/workloads/ClSplitterUint8Workload.hpp
new file mode 100644
index 0000000000..19e8be5034
--- /dev/null
+++ b/src/backends/cl/workloads/ClSplitterUint8Workload.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseSplitterWorkload.hpp"
+
+namespace armnn
+{
+class ClSplitterUint8Workload : public ClBaseSplitterWorkload<DataType::QuantisedAsymm8>
+{
+public:
+ using ClBaseSplitterWorkload<DataType::QuantisedAsymm8>::ClBaseSplitterWorkload;
+ virtual void Execute() const override;
+};
+} //namespace armnn
+
+
+
diff --git a/src/backends/cl/workloads/ClSubtractionWorkload.cpp b/src/backends/cl/workloads/ClSubtractionWorkload.cpp
new file mode 100644
index 0000000000..37b334d94e
--- /dev/null
+++ b/src/backends/cl/workloads/ClSubtractionWorkload.cpp
@@ -0,0 +1,66 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClSubtractionWorkload.hpp"
+
+#include <backends/cl/ClTensorHandle.hpp>
+#include <backends/CpuTensorHandle.hpp>
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+using namespace armcomputetensorutils;
+
+static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE;
+
+template <armnn::DataType... T>
+ClSubtractionWorkload<T...>::ClSubtractionWorkload(const SubtractionQueueDescriptor& descriptor,
+ const WorkloadInfo& info)
+ : TypedWorkload<SubtractionQueueDescriptor, T...>(descriptor, info)
+{
+ this->m_Data.ValidateInputsOutputs("ClSubtractionWorkload", 2, 1);
+
+ arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[1])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor();
+ m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy);
+}
+
+template <armnn::DataType... T>
+void ClSubtractionWorkload<T...>::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_CL("ClSubtractionWorkload_Execute");
+ m_Layer.run();
+}
+
+bool ClSubtractionValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported)
+{
+ const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0);
+ const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1);
+ const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output);
+
+ const arm_compute::Status aclStatus = arm_compute::CLArithmeticSubtraction::validate(&aclInput0Info,
+ &aclInput1Info,
+ &aclOutputInfo,
+ g_AclConvertPolicy);
+
+ const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK);
+ if (!supported && reasonIfUnsupported)
+ {
+ *reasonIfUnsupported = aclStatus.error_description();
+ }
+
+ return supported;
+}
+
+} //namespace armnn
+
+template class armnn::ClSubtractionWorkload<armnn::DataType::Float16, armnn::DataType::Float32>;
+template class armnn::ClSubtractionWorkload<armnn::DataType::QuantisedAsymm8>;
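ClSubtractionValidate folds the ACL status into a bool plus an optional reason string. A minimal sketch of a caller, using purely illustrative tensor shapes:

    #include <backends/cl/workloads/ClSubtractionWorkload.hpp>
    #include <armnn/Tensor.hpp>
    #include <iostream>
    #include <string>

    void CheckClSubtractionSketch()
    {
        const armnn::TensorInfo info({ 1, 2, 2, 3 }, armnn::DataType::Float32);

        std::string reason;
        if (!armnn::ClSubtractionValidate(info, info, info, &reason))
        {
            std::cout << "Subtraction not supported on GpuAcc: " << reason << std::endl;
        }
    }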
diff --git a/src/backends/cl/workloads/ClSubtractionWorkload.hpp b/src/backends/cl/workloads/ClSubtractionWorkload.hpp
new file mode 100644
index 0000000000..67b219b09d
--- /dev/null
+++ b/src/backends/cl/workloads/ClSubtractionWorkload.hpp
@@ -0,0 +1,31 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backends/Workload.hpp>
+
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace armnn
+{
+
+template <armnn::DataType... dataTypes>
+class ClSubtractionWorkload : public TypedWorkload<SubtractionQueueDescriptor, dataTypes...>
+{
+public:
+ ClSubtractionWorkload(const SubtractionQueueDescriptor& descriptor, const WorkloadInfo& info);
+
+ void Execute() const override;
+
+private:
+ mutable arm_compute::CLArithmeticSubtraction m_Layer;
+};
+
+bool ClSubtractionValidate(const TensorInfo& input0,
+ const TensorInfo& input1,
+ const TensorInfo& output,
+ std::string* reasonIfUnsupported);
+} //namespace armnn
diff --git a/src/backends/cl/workloads/ClWorkloadUtils.hpp b/src/backends/cl/workloads/ClWorkloadUtils.hpp
new file mode 100644
index 0000000000..3a8ff00bb6
--- /dev/null
+++ b/src/backends/cl/workloads/ClWorkloadUtils.hpp
@@ -0,0 +1,63 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "OpenClTimer.hpp"
+#include <backends/aclCommon/ArmComputeTensorUtils.hpp>
+#include <backends/CpuTensorHandle.hpp>
+
+#include <Half.hpp>
+
+#define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \
+ ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \
+ name, \
+ armnn::OpenClTimer(), \
+ armnn::WallClockTimer())
+
+namespace armnn
+{
+
+template <typename T>
+void CopyArmComputeClTensorData(arm_compute::CLTensor& dstTensor, const T* srcData)
+{
+ {
+ ARMNN_SCOPED_PROFILING_EVENT_CL("MapClTensorForWriting");
+ dstTensor.map(true);
+ }
+
+ {
+ ARMNN_SCOPED_PROFILING_EVENT_CL("CopyToClTensor");
+ armcomputetensorutils::CopyArmComputeITensorData<T>(srcData, dstTensor);
+ }
+
+ dstTensor.unmap();
+}
+
+inline void InitializeArmComputeClTensorData(arm_compute::CLTensor& clTensor,
+ const ConstCpuTensorHandle* handle)
+{
+ BOOST_ASSERT(handle);
+
+ armcomputetensorutils::InitialiseArmComputeTensorEmpty(clTensor);
+ switch(handle->GetTensorInfo().GetDataType())
+ {
+ case DataType::Float16:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<armnn::Half>());
+ break;
+ case DataType::Float32:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<float>());
+ break;
+ case DataType::QuantisedAsymm8:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<uint8_t>());
+ break;
+ case DataType::Signed32:
+ CopyArmComputeClTensorData(clTensor, handle->GetConstTensor<int32_t>());
+ break;
+ default:
+ BOOST_ASSERT_MSG(false, "Unexpected tensor type.");
+ }
+}
+
+} //namespace armnn
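InitializeArmComputeClTensorData allocates the CL tensor and then copies the host-side constant data into it, dispatching on the ArmNN data type. A minimal sketch of the usual pairing with BuildArmComputeTensor when uploading constant data such as weights (the helper name and the assumption that the handle is non-null are illustrative):

    #include <backends/cl/workloads/ClWorkloadUtils.hpp>
    #include <arm_compute/runtime/CL/CLTensor.h>

    // Shape the CL tensor from the ArmNN TensorInfo, then copy the constant data across.
    void UploadConstantTensorSketch(arm_compute::CLTensor& clTensor,
                                    const armnn::ConstCpuTensorHandle* handle)
    {
        armnn::armcomputetensorutils::BuildArmComputeTensor(clTensor, handle->GetTensorInfo());
        armnn::InitializeArmComputeClTensorData(clTensor, handle);
    }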
diff --git a/src/backends/cl/workloads/ClWorkloads.hpp b/src/backends/cl/workloads/ClWorkloads.hpp
new file mode 100644
index 0000000000..3329f42e08
--- /dev/null
+++ b/src/backends/cl/workloads/ClWorkloads.hpp
@@ -0,0 +1,41 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+#include "ClActivationFloatWorkload.hpp"
+#include "ClActivationUint8Workload.hpp"
+#include "ClAdditionWorkload.hpp"
+#include "ClBaseConstantWorkload.hpp"
+#include "ClBaseMergerWorkload.hpp"
+#include "ClBatchNormalizationFloatWorkload.hpp"
+#include "ClConstantFloatWorkload.hpp"
+#include "ClConstantUint8Workload.hpp"
+#include "ClConvolution2dFloatWorkload.hpp"
+#include "ClConvolution2dUint8Workload.hpp"
+#include "ClDepthwiseConvolutionFloatWorkload.hpp"
+#include "ClDepthwiseConvolutionUint8Workload.hpp"
+#include "ClDivisionFloatWorkload.hpp"
+#include "ClFloorFloatWorkload.hpp"
+#include "ClFullyConnectedWorkload.hpp"
+#include "ClL2NormalizationFloatWorkload.hpp"
+#include "ClLstmFloatWorkload.hpp"
+#include "ClMergerFloatWorkload.hpp"
+#include "ClMergerUint8Workload.hpp"
+#include "ClMultiplicationFloatWorkload.hpp"
+#include "ClNormalizationFloatWorkload.hpp"
+#include "ClPermuteWorkload.hpp"
+#include "ClPadWorkload.hpp"
+#include "ClPooling2dFloatWorkload.hpp"
+#include "ClPooling2dUint8Workload.hpp"
+#include "ClReshapeFloatWorkload.hpp"
+#include "ClReshapeUint8Workload.hpp"
+#include "ClResizeBilinearFloatWorkload.hpp"
+#include "ClSoftmaxFloatWorkload.hpp"
+#include "ClSoftmaxUint8Workload.hpp"
+#include "ClSplitterFloatWorkload.hpp"
+#include "ClSplitterUint8Workload.hpp"
+#include "ClSubtractionWorkload.hpp"
+#include "ClConvertFp16ToFp32Workload.hpp"
+#include "ClConvertFp32ToFp16Workload.hpp"