Set up the framework to choose the default LWS

Resolve COMPMID-4486 Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Change-Id: Ib38b7943bd776a6d75d1da163908724c49eae73d Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5864 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Giorgio Arena <giorgio.arena@arm.com> 2021-06-28 11:00:27 +0100
committer: Giorgio Arena <giorgio.arena@arm.com> 2021-06-29 11:06:10 +0000
commit: 4a95bba6ca61ce99995ece6fd237b5498c9f322c (patch)
tree: 04ef2dee1cdc015ce14ce6b664d9a956a96c3a05
parent: 731fe667e3009bfbfee8b0eb74ecb68b291e4311 (diff)
download: ComputeLibrary-4a95bba6ca61ce99995ece6fd237b5498c9f322c.tar.gz
116 files changed, 323 insertions, 55 deletions
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
index 2ba2e8dd62..ede8d0a9e4 100644
--- a/arm_compute/core/CL/CLTypes.h
+++ b/arm_compute/core/CL/CLTypes.h
@@ -75,5 +75,16 @@ struct CLQuantization
     const ICLFloatArray *scale;  /**< Quantization scale array */
     const ICLInt32Array *offset; /**< Quantization offset array */
 };
+
+enum CLKernelType
+{
+    UNKNOWN,     /**< Unknown CL kernel type */
+    DEPTHWISE,   /**< Depthwise CL kernel type */
+    DIRECT,      /**< Direct Convolution CL kernel type */
+    ELEMENTWISE, /**< Elementeise CL kernel type */
+    GEMM,        /**< GEMM CL kernel type */
+    POOL,        /**< Pool CL kernel type */
+    WINOGRAD     /**< Winograd CL kernel type */
+};
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_TYPES_H */
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 6737109f34..ae3077a564 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -37,6 +37,26 @@
 
 namespace arm_compute
 {
+namespace
+{
+bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
+{
+    if(lws0.dimensions() != lws1.dimensions())
+    {
+        return false;
+    }
+
+    for(size_t i = 0; i < lws0.dimensions(); ++i)
+    {
+        if(lws0.get()[i] != lws1.get()[i])
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+} // namespace
 template <typename T>
 class ICLArray;
 class ICLTensor;
@@ -64,6 +84,13 @@ private:
     {
         return 2 + 2 * dimension_size;
     }
+
+    cl::NDRange default_lws_tune(const Window &window)
+    {
+        ARM_COMPUTE_UNUSED(window);
+        return CLKernelLibrary::get().default_ndrange();
+    }
+
     using IKernel::configure; //Prevent children from calling IKernel::configure() directly
 protected:
     /** Configure the kernel's window and local workgroup size hint.
@@ -85,13 +112,19 @@ protected:
     void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
     {
         _tuning_params_hint = tuning_params_hint;
+
+        if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
+        {
+            _tuning_params_hint.set_lws(default_lws_tune(window));
+        }
+
         IKernel::configure(window);
     }
 
 public:
     /** Constructor */
     ICLKernel()
-        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint()
+        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint()
     {
     }
     /** Returns a reference to the OpenCL kernel of this object.
@@ -102,6 +135,14 @@ public:
     {
         return _kernel;
     }
+    /** Returns the CL kernel type
+     *
+     * @return The CL kernel type
+     */
+    CLKernelType type() const
+    {
+        return _type;
+    }
     /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
@@ -372,10 +413,11 @@ private:
     void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
 
 protected:
-    cl::Kernel  _kernel;             /**< OpenCL kernel to run */
-    GPUTarget   _target;             /**< The targeted GPU */
-    std::string _config_id;          /**< Configuration ID */
-    size_t      _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+    cl::Kernel   _kernel;             /**< OpenCL kernel to run */
+    GPUTarget    _target;             /**< The targeted GPU */
+    std::string  _config_id;          /**< Configuration ID */
+    size_t       _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+    CLKernelType _type;               /**< The CL kernel type */
 private:
     CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
 };
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 909972482f..7af2fa1e64 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -69,6 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *prev_outp
 CLArgMinMaxLayerKernel::CLArgMinMaxLayerKernel()
     : _input(nullptr), _prev_output(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::ARG_IDX_MAX)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op)
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 44bdc6f587..24c8b4da5b 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,6 +109,7 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
 CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
     : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index da41feb7b8..6f333dd925 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -83,6 +83,7 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
 CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
     : _input(nullptr), _block_shape(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
index b1f7c00fac..4c3ea80895 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@ namespace arm_compute
 CLBitwiseKernel::CLBitwiseKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op)
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 1bf0dc7445..f57221d16b 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -85,6 +85,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
 CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
     : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index 8a6b58002c..7c8a7ce150 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -97,6 +97,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
     : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 5f52945efb..6d7b83471f 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -83,6 +83,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLCol2ImKernel::CLCol2ImKernel()
     : _input(nullptr), _output(nullptr), _convolved_dims()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index d0b29e2ba8..21f98349a0 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -101,6 +101,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITe
 CLComparisonKernel::CLComparisonKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index eb420d8842..505a93761d 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -36,6 +36,7 @@ namespace arm_compute
 CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
     : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index ca7e9d4b23..8863de5c57 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,6 +113,7 @@ CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
     : _add_bias(false),
       _bias(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index 8946f2a713..efc6f820f2 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,6 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
     : _input(nullptr), _output(nullptr), _block_shape()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLDepthToSpaceLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 67ca341b0d..65c4b8568c 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -142,6 +142,7 @@ CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel
       _export_to_cl_image(false),
       _is_quantized(false)
 {
+    _type = CLKernelType::DEPTHWISE;
 }
 
 void CLDepthwiseConvolutionLayerNativeKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index ff04708b5b..bbf4e55433 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -70,6 +70,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
     : _input(nullptr), _output(nullptr), _idx(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 779bf43922..0ee247fecd 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -78,6 +78,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLFFTRadixStageKernel::CLFFTRadixStageKernel()
     : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index c80f774c6a..8901345738 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -55,6 +55,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 CLFFTScaleKernel::CLFFTScaleKernel()
     : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLFFTScaleKernel::configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 840ed0ca2f..ded707e9d6 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,6 +39,7 @@ namespace arm_compute
 CLFillBorderKernel::CLFillBorderKernel()
     : ICLKernel(), _tensor(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 bool CLFillBorderKernel::is_parallelisable() const
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 2116239080..0695ff9935 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -103,6 +103,7 @@ CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
     : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
       _run_in_place_weights(false), _run_in_place_bias(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
index 9215fd602d..efba3d1d7a 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
@@ -156,6 +156,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 CLGEMMLowpMatrixMultiplyNativeKernel::CLGEMMLowpMatrixMultiplyNativeKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
 {
+    _type = CLKernelType::GEMM;
 }
 
 void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
index 848f272e50..fd533fc310 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -130,6 +130,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
 {
+    _type = CLKernelType::GEMM;
 }
 
 void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 37c11000db..3424b6d264 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -291,6 +291,7 @@ CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMLowpMatrixMultiplyReshapedO
       _is_quantized_per_channel(false),
       _fuse_output_stage(false)
 {
+    _type = CLKernelType::GEMM;
 }
 
 void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info,
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index e621323c5f..b97bb84e59 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -96,6 +96,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
 CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
     : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
index 8ed83ed52c..eaa832fbaa 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -127,6 +127,7 @@ CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutpu
       _output_shifts(nullptr),
       _is_quantized_per_channel(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
index 5d827189e2..03bdf3f836 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -65,6 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
 CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel()
     : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index adbbb1f5ac..abef484c1e 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -69,6 +69,7 @@ class Coordinates;
 CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel()
     : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 7af4d16780..faf86c769e 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -65,6 +65,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
 CLGEMMLowpQuantizeDownInt32ScaleKernel::CLGEMMLowpQuantizeDownInt32ScaleKernel()
     : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index 3d23aa7f34..55ae3ea6ca 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -64,6 +64,7 @@ Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITe
 ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
     : _input(), _output()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index cbd540d80b..b49e6351a2 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -75,6 +75,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLGatherKernel::CLGatherKernel()
     : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 459ed035b1..8b008c306b 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -68,6 +68,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
 CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
     : _anchors(nullptr), _all_anchors(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 44012690e7..97740e3c34 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -288,6 +288,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
 CLIm2ColKernel::CLIm2ColKernel()
     : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 323579dc3c..74cbef151b 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -72,6 +72,7 @@ Status validate_arguments_meanvar(const ITensorInfo *input, const ITensorInfo *o
 CLComputeMeanVariance::CLComputeMeanVariance()
     : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision)
@@ -152,6 +153,7 @@ void CLComputeMeanVariance::run(const Window &window, cl::CommandQueue &queue)
 CLInstanceNormalizationLayerKernel::CLInstanceNormalizationLayerKernel()
     : _input(nullptr), _output(nullptr), _mean(nullptr), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index c688951d57..46c0747cf5 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -74,6 +74,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
 CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
     : _input(nullptr), _sum(nullptr), _output(nullptr), _actual_axis(0), _epsilon(1e-12)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index ab68e0f68d..89a6d82947 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -74,6 +74,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
     : _input(nullptr), _output(nullptr), _indices(nullptr)
 {
+    _type = CLKernelType::POOL;
 }
 
 void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index 9f98b67582..da9e367590 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -59,6 +59,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
 CLMeanStdDevNormalizationKernel::CLMeanStdDevNormalizationKernel()
     : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLMeanStdDevNormalizationKernel::configure(ICLTensor *input, ICLTensor *output, float epsilon)
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index ac8770467e..f0202a9c5d 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -83,6 +83,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
 CLMinMaxLayerKernel::CLMinMaxLayerKernel()
     : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 9242505315..a5dfafe338 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -113,6 +113,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 CLNormalizationLayerKernel::CLNormalizationLayerKernel()
     : _input(nullptr), _output(nullptr), _border_size(0), _is_norm_across_width(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 BorderSize CLNormalizationLayerKernel::border_size() const
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index cf2511adec..6c23b18e62 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -83,6 +83,7 @@ std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input,
 CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
     : _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 2f54b390d5..eaab992840 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,6 +67,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 CLPadLayerKernel::CLPadLayerKernel()
     : _input(nullptr), _output(nullptr), _4d_enabled(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 7b9caf0063..bf1b874dd0 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,6 +99,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
 CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr), _info(), _num_priors(), _min(), _max(), _aspect_ratios()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index f68520dee6..5ad4355202 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -82,6 +82,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 CLQLSTMLayerNormalizationKernel::CLQLSTMLayerNormalizationKernel()
     : _input(nullptr), _weight(nullptr), _bias(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 9894c731fe..34d3e70c25 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -79,6 +79,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITe
 CLROIAlignLayerKernel::CLROIAlignLayerKernel()
     : _input(nullptr), _output(nullptr), _rois(nullptr), _pool_info(0, 0, 0.f)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 7a843d65a2..663da0467a 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -44,6 +44,7 @@ namespace arm_compute
 CLROIPoolingLayerKernel::CLROIPoolingLayerKernel()
     : _input(nullptr), _rois(nullptr), _output(nullptr), _pool_info(0, 0, 0.f)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index 85f79988c9..b245e62bc0 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,6 +68,7 @@ Status validate_arguments(const ITensorInfo *output, const float start, const fl
 CLRangeKernel::CLRangeKernel()
     : _start(0), _end(1), _step(1), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLRangeKernel::configure(ICLTensor *output, const float start, const float end, const float step)
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 133a35f513..3b3b6c0364 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -75,6 +75,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
 CLReductionOperationKernel::CLReductionOperationKernel()
     : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
index 7e3157c99d..ea3b637e8d 100644
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -37,6 +37,7 @@ namespace arm_compute
 CLRemapKernel::CLRemapKernel()
     : _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr), _data_layout(DataLayout::NCHW)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 BorderSize CLRemapKernel::border_size() const
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index c6c7824188..aa5f16fbc1 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -68,6 +68,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 CLReorgLayerKernel::CLReorgLayerKernel()
     : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t stride)
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index b3c9bcafd1..7e9431e230 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,6 +61,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 CLReverseKernel::CLReverseKernel()
     : _input(nullptr), _output(nullptr), _axis(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index f8e63ddc43..43b958a8dc 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,6 +65,7 @@ Status validate_arguments(const ITensorInfo *c, const ITensorInfo *x, const ITen
 CLSelectKernel::CLSelectKernel()
     : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 57f7af488b..6533731571 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -84,6 +84,7 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
 CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
     : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 4e5b417ec6..e7656b805c 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,6 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
 CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
     : _input(nullptr), _output(nullptr), _block_shape()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLSpaceToDepthLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 9bdcc8dc3f..075c93ab60 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,6 +76,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
 CLStackLayerKernel::CLStackLayerKernel()
     : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index dd51df9363..464f74c9eb 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -70,6 +70,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
 }
 } // namespace
 
+CLStridedSliceKernel::CLStridedSliceKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
index 599cf34c39..4c201504f5 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.h
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,6 +35,9 @@ namespace arm_compute
 class CLStridedSliceKernel : public ICLKernel
 {
 public:
+    /** Default constructor */
+    CLStridedSliceKernel();
+
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index c0c3d2e2ee..e4eed68c84 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
 CLTileKernel::CLTileKernel()
     : _input(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLTileKernel::configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index d55c548b99..45e3505d0f 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -69,6 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c
 CLWeightsReshapeKernel::CLWeightsReshapeKernel()
     : _input(nullptr), _biases(nullptr), _output(nullptr)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups)
diff --git a/src/core/gpu/cl/kernels/ClActivationKernel.cpp b/src/core/gpu/cl/kernels/ClActivationKernel.cpp
index 17a8c6498d..e892d6a778 100644
--- a/src/core/gpu/cl/kernels/ClActivationKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClActivationKernel.cpp
@@ -89,6 +89,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
 ClActivationKernel::ClActivationKernel()
     : _run_in_place(false)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void ClActivationKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info)
diff --git a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
index 26f5113822..dbc628d6d5 100644
--- a/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
@@ -62,6 +62,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, con
 ClBatchConcatenateKernel::ClBatchConcatenateKernel()
     : _batch_offset(0)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
diff --git a/src/core/gpu/cl/kernels/ClCastKernel.cpp b/src/core/gpu/cl/kernels/ClCastKernel.cpp
index 7a1d5c2824..fac9ebe5cf 100644
--- a/src/core/gpu/cl/kernels/ClCastKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClCastKernel.cpp
@@ -72,6 +72,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
 }
 } // namespace
 
+ClCastKernel::ClCastKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClCastKernel.h b/src/core/gpu/cl/kernels/ClCastKernel.h
index 451aa9c1ab..6bf3cd9e50 100644
--- a/src/core/gpu/cl/kernels/ClCastKernel.h
+++ b/src/core/gpu/cl/kernels/ClCastKernel.h
@@ -41,7 +41,7 @@ namespace kernels
 class ClCastKernel : public IClKernel
 {
 public:
-    ClCastKernel() = default;
+    ClCastKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCastKernel);
     /** Set the src and dst of the kernel.
      *
diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
index 49f2f68a76..d1abd274d6 100644
--- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
@@ -40,6 +40,11 @@ namespace opencl
 {
 namespace kernels
 {
+ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape,
                                                      DataLayout data_layout)
 {
diff --git a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
index 11ab4d2a0d..6f4f09dc32 100644
--- a/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
+++ b/src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
@@ -47,7 +47,7 @@ namespace kernels
 class ClConvertFullyConnectedWeightsKernel : public IClKernel
 {
 public:
-    ClConvertFullyConnectedWeightsKernel() = default;
+    ClConvertFullyConnectedWeightsKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClConvertFullyConnectedWeightsKernel);
     /** Set the src and dst tensor.
      *
diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.cpp b/src/core/gpu/cl/kernels/ClCopyKernel.cpp
index d6c87f8fad..98c6f34e60 100644
--- a/src/core/gpu/cl/kernels/ClCopyKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClCopyKernel.cpp
@@ -68,6 +68,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window
 
 } // namespace
 
+ClCopyKernel::ClCopyKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClCopyKernel.h b/src/core/gpu/cl/kernels/ClCopyKernel.h
index b1b9672bcb..f3eb0aab62 100644
--- a/src/core/gpu/cl/kernels/ClCopyKernel.h
+++ b/src/core/gpu/cl/kernels/ClCopyKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClCopyKernel : public IClKernel
 {
 public:
-    ClCopyKernel() = default;
+    ClCopyKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCopyKernel);
     /** Initialize the kernel's src, dst.
      *
diff --git a/src/core/gpu/cl/kernels/ClCropKernel.cpp b/src/core/gpu/cl/kernels/ClCropKernel.cpp
index 1d322eefa1..ef2e48b45d 100644
--- a/src/core/gpu/cl/kernels/ClCropKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClCropKernel.cpp
@@ -46,6 +46,11 @@ void ClCropKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Coordinat
     configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window);
 }
 
+ClCropKernel::ClCropKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index,
                              float extrapolation_value, Window *dst_window)
 {
diff --git a/src/core/gpu/cl/kernels/ClCropKernel.h b/src/core/gpu/cl/kernels/ClCropKernel.h
index ec0f8e58da..7120dbbc81 100644
--- a/src/core/gpu/cl/kernels/ClCropKernel.h
+++ b/src/core/gpu/cl/kernels/ClCropKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClCropKernel : public IClKernel
 {
 public:
-    ClCropKernel() = default;
+    ClCropKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCropKernel);
     /** Configure kernel
      *
diff --git a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
index 4039570da4..e3e384f748 100644
--- a/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
@@ -61,6 +61,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con
 ClDepthConcatenateKernel::ClDepthConcatenateKernel()
     : _depth_offset(0)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
index f2758b759f..d69da8716c 100644
--- a/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClDequantizeKernel.cpp
@@ -61,6 +61,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 }
 } // namespace
 
+ClDequantizeKernel::ClDequantizeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClDequantizeKernel.h b/src/core/gpu/cl/kernels/ClDequantizeKernel.h
index 33e0164cc9..2460674067 100644
--- a/src/core/gpu/cl/kernels/ClDequantizeKernel.h
+++ b/src/core/gpu/cl/kernels/ClDequantizeKernel.h
@@ -39,7 +39,7 @@ class ClDequantizeKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClDequantizeKernel() = default;
+    ClDequantizeKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel);
     /** Initialise the kernel's input and output
      *
diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 94c4044bff..7b98671da2 100644
--- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -377,6 +377,11 @@ BorderSize ClDirectConv2dKernel::border_size() const
     return _border_size;
 }
 
+ClDirectConv2dKernel::ClDirectConv2dKernel()
+{
+    _type = CLKernelType::DIRECT;
+}
+
 void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
                                      const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
diff --git a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
index e76666fd36..b592a2191b 100644
--- a/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
+++ b/src/core/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClDirectConv2dKernel : public IClKernel
 {
 public:
-    ClDirectConv2dKernel() = default;
+    ClDirectConv2dKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel);
     /** Set the src, weights, biases and dst tensors info.
      *
diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
index 335ee9c392..7bfdb9efdd 100644
--- a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -98,7 +98,7 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src
     return Status{};
 }
 
-Status validate_arguments_divide_operation(const ITensorInfo* src1, const ITensorInfo* src2, const ITensorInfo* dst)
+Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
@@ -271,6 +271,11 @@ std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo
 }
 } // namespace
 
+ClElementwiseKernel::ClElementwiseKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClElementwiseKernel::configure_common(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
 {
     configure_common(CLKernelLibrary::get().get_compile_context(), src1, src2, dst);
diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.h b/src/core/gpu/cl/kernels/ClElementwiseKernel.h
index 4ed8ae73ab..7f55151a87 100644
--- a/src/core/gpu/cl/kernels/ClElementwiseKernel.h
+++ b/src/core/gpu/cl/kernels/ClElementwiseKernel.h
@@ -45,7 +45,7 @@ class ClElementwiseKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClElementwiseKernel() = default;
+    ClElementwiseKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementwiseKernel);
 
     // Inherited methods overridden:
@@ -89,9 +89,15 @@ protected:
     ActivationLayerInfo _act_info{};
 
 private:
-    const ITensorInfo *_src1{ nullptr }; /**< Source tensor info 1 */
-    const ITensorInfo *_src2{ nullptr }; /**< Source tensor info 2 */
-    ITensorInfo       *_dst{ nullptr };  /**< Destination tensor info */
+    const ITensorInfo *_src1
+    {
+        nullptr
+    }; /**< Source tensor info 1 */
+    const ITensorInfo *_src2
+    {
+        nullptr
+    };                            /**< Source tensor info 2 */
+    ITensorInfo *_dst{ nullptr }; /**< Destination tensor info */
 };
 
 class ClLogicalBinaryKernel : public ClElementwiseKernel
diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
index 5cbb3f2e38..1525c0fe54 100644
--- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -66,6 +66,11 @@ Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const
 }
 } // namespace
 
+ClElementWiseUnaryKernel::ClElementWiseUnaryKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h
index 7e5edef3ee..225869b58b 100644
--- a/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h
+++ b/src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClElementWiseUnaryKernel : public IClKernel
 {
 public:
-    ClElementWiseUnaryKernel() = default;
+    ClElementWiseUnaryKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementWiseUnaryKernel);
     /** Initialise the kernel's srcs, dst.
      *
diff --git a/src/core/gpu/cl/kernels/ClFillKernel.cpp b/src/core/gpu/cl/kernels/ClFillKernel.cpp
index b194ee549b..526a466a00 100644
--- a/src/core/gpu/cl/kernels/ClFillKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClFillKernel.cpp
@@ -42,6 +42,11 @@ namespace opencl
 {
 namespace kernels
 {
+ClFillKernel::ClFillKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClFillKernel::configure(ITensorInfo      *tensor,
                              const PixelValue &constant_value,
                              Window           *window)
diff --git a/src/core/gpu/cl/kernels/ClFillKernel.h b/src/core/gpu/cl/kernels/ClFillKernel.h
index b439eac0de..9542c20508 100644
--- a/src/core/gpu/cl/kernels/ClFillKernel.h
+++ b/src/core/gpu/cl/kernels/ClFillKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClFillKernel : public IClKernel
 {
 public:
-    ClFillKernel() = default;
+    ClFillKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFillKernel);
     /** Initialise the kernel's tensor and filling value
      *
diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.cpp b/src/core/gpu/cl/kernels/ClFloorKernel.cpp
index 7296d40eaf..2047128963 100644
--- a/src/core/gpu/cl/kernels/ClFloorKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClFloorKernel.cpp
@@ -61,6 +61,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 }
 } // namespace
 
+ClFloorKernel::ClFloorKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClFloorKernel.h b/src/core/gpu/cl/kernels/ClFloorKernel.h
index 646dfb30d8..3bc648b7be 100644
--- a/src/core/gpu/cl/kernels/ClFloorKernel.h
+++ b/src/core/gpu/cl/kernels/ClFloorKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClFloorKernel : public IClKernel
 {
 public:
-    ClFloorKernel() = default;
+    ClFloorKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFloorKernel);
     /** Configure kernel for a given list of arguments
      *
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
index 817a105b14..6079644935 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
@@ -262,6 +262,11 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0
 }
 } // namespace
 
+ClGemmMatrixMultiplyKernel::ClGemmMatrixMultiplyKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
 void ClGemmMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
                                            float beta,
                                            bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h
index c1601335ee..c303f78b07 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h
@@ -45,7 +45,7 @@ namespace kernels
 class ClGemmMatrixMultiplyKernel : public IClKernel
 {
 public:
-    ClGemmMatrixMultiplyKernel() = default;
+    ClGemmMatrixMultiplyKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyKernel);
     /** Initialise the kernel's input, output and alpha
      *
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index 97d64c433c..5ae55ab04a 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -194,6 +194,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
 }
 } // namespace
 
+ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
 void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
                                                  float                    beta,
                                                  const GEMMLHSMatrixInfo &lhs_info,
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index 4770b18b8e..c3bdc7589e 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -39,7 +39,7 @@ namespace kernels
 class ClGemmMatrixMultiplyNativeKernel : public IClKernel
 {
 public:
-    ClGemmMatrixMultiplyNativeKernel() = default;
+    ClGemmMatrixMultiplyNativeKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel);
     /** Initialise the kernel's input and dst.
      *
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index 27409b66ac..591834f762 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -184,6 +184,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
 }
 } // namespace
 
+ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
 void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
                                                    ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
                                                    const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index ab648f15ae..b8ae4b9ae3 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -45,7 +45,7 @@ namespace kernels
 class ClGemmMatrixMultiplyReshapedKernel : public IClKernel
 {
 public:
-    ClGemmMatrixMultiplyReshapedKernel() = default;
+    ClGemmMatrixMultiplyReshapedKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel);
     /** Initialise the kernel's input and output.
      *
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index 4eea2c6f76..32ee0f9705 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -181,6 +181,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens
 }
 } // namespace
 
+ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
 void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
                                                           ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
                                                           const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index ff6c391e15..3d6164eca9 100644
--- a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -43,7 +43,7 @@ namespace kernels
 class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public ICLKernel
 {
 public:
-    ClGemmMatrixMultiplyReshapedOnlyRhsKernel() = default;
+    ClGemmMatrixMultiplyReshapedOnlyRhsKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel);
     /** Initialise the kernel's input and output.
      *
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
index 98161edfff..f92945e2a4 100644
--- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
@@ -111,6 +111,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
 }
 } // namespace
 
+ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
index b830ba02b4..73d811f3c3 100644
--- a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
+++ b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
@@ -41,7 +41,7 @@ namespace kernels
 class ClGemmReshapeLhsMatrixKernel : public ICLKernel
 {
 public:
-    ClGemmReshapeLhsMatrixKernel() = default;
+    ClGemmReshapeLhsMatrixKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel);
     /** Initialise the kernel's input and output.
      *
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
index e1ef7c61aa..3a6f3c7e8f 100644
--- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
@@ -107,6 +107,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
 }
 } // namespace
 
+ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
index e877d87408..27f80d3428 100644
--- a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
+++ b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
@@ -40,7 +40,7 @@ namespace kernels
 class ClGemmReshapeRhsMatrixKernel : public ICLKernel
 {
 public:
-    ClGemmReshapeRhsMatrixKernel() = default;
+    ClGemmReshapeRhsMatrixKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel);
     /** Initialise the kernel's input and output.
      *
diff --git a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
index 4436e98fe3..9ff30eedcd 100644
--- a/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
@@ -64,6 +64,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co
 ClHeightConcatenateKernel::ClHeightConcatenateKernel()
     : _height_offset(0)
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp
index b8081bbacf..65f3bec099 100644
--- a/src/core/gpu/cl/kernels/ClMulKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClMulKernel.cpp
@@ -92,6 +92,11 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
 }
 } // namespace
 
+ClMulKernel::ClMulKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
                             ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
 {
@@ -128,7 +133,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo
     else
     {
         if(src1->element_size() == 4 || src2->element_size() == 4)
-        {   
+        {
             // use 64 bit accumulator for 32-bit input
             acc_type = "long";
         }
@@ -316,6 +321,11 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr
 }
 } // namespace
 
+ClComplexMulKernel::ClComplexMulKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
diff --git a/src/core/gpu/cl/kernels/ClMulKernel.h b/src/core/gpu/cl/kernels/ClMulKernel.h
index 44162f3db3..9c70301d89 100644
--- a/src/core/gpu/cl/kernels/ClMulKernel.h
+++ b/src/core/gpu/cl/kernels/ClMulKernel.h
@@ -39,7 +39,7 @@ class ClMulKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClMulKernel() = default;
+    ClMulKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel);
     /** Initialise the kernel's src and dst.
      *
@@ -88,7 +88,7 @@ class ClComplexMulKernel : public ICLKernel
 {
 public:
     /** Default constructor */
-    ClComplexMulKernel() = default;
+    ClComplexMulKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel);
     /** Initialise the kernel's src and dst.
      *
diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp b/src/core/gpu/cl/kernels/ClPermuteKernel.cpp
index ffc13060a8..722bf454f2 100644
--- a/src/core/gpu/cl/kernels/ClPermuteKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClPermuteKernel.cpp
@@ -77,6 +77,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
 }
 } // namespace
 
+ClPermuteKernel::ClPermuteKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClPermuteKernel.h b/src/core/gpu/cl/kernels/ClPermuteKernel.h
index b844214595..326110a27c 100644
--- a/src/core/gpu/cl/kernels/ClPermuteKernel.h
+++ b/src/core/gpu/cl/kernels/ClPermuteKernel.h
@@ -42,7 +42,7 @@ class ClPermuteKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClPermuteKernel() = default;
+    ClPermuteKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPermuteKernel);
     /** Set the src and dst of the kernel.
      *
diff --git a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
index 0e15bffd14..9d5a24fdf2 100644
--- a/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -205,6 +205,7 @@ std::tuple<Status, Window, ClPoolingConfig> validate_and_configure_window(ITenso
 ClPool2dKernel::ClPool2dKernel()
     : _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
 {
+    _type = CLKernelType::POOL;
 }
 
 BorderSize ClPool2dKernel::border_size() const
diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
index 48d351d536..7900489db7 100644
--- a/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClQuantizeKernel.cpp
@@ -61,6 +61,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 }
 } // namespace
 
+ClQuantizeKernel::ClQuantizeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClQuantizeKernel.h b/src/core/gpu/cl/kernels/ClQuantizeKernel.h
index 8d37f33032..cd972987f5 100644
--- a/src/core/gpu/cl/kernels/ClQuantizeKernel.h
+++ b/src/core/gpu/cl/kernels/ClQuantizeKernel.h
@@ -42,7 +42,7 @@ class ClQuantizeKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClQuantizeKernel() = default;
+    ClQuantizeKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel);
     /** Set the input, output.
      *
diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp b/src/core/gpu/cl/kernels/ClReshapeKernel.cpp
index 923b9cb264..fcda061930 100644
--- a/src/core/gpu/cl/kernels/ClReshapeKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClReshapeKernel.cpp
@@ -62,6 +62,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 }
 } // namespace
 
+ClReshapeKernel::ClReshapeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClReshapeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClReshapeKernel.h b/src/core/gpu/cl/kernels/ClReshapeKernel.h
index 0501b93f40..3cd8369012 100644
--- a/src/core/gpu/cl/kernels/ClReshapeKernel.h
+++ b/src/core/gpu/cl/kernels/ClReshapeKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClReshapeKernel : public IClKernel
 {
 public:
-    ClReshapeKernel() = default;
+    ClReshapeKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClReshapeKernel);
     /** Set the src and dst of the kernel
      *
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.cpp b/src/core/gpu/cl/kernels/ClScaleKernel.cpp
index 7fb5d2a5d3..57ca331539 100644
--- a/src/core/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClScaleKernel.cpp
@@ -140,6 +140,11 @@ Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, c
     return Status{};
 }
 
+ClScaleKernel::ClScaleKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info));
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.h b/src/core/gpu/cl/kernels/ClScaleKernel.h
index ad7632c713..826c4821b5 100644
--- a/src/core/gpu/cl/kernels/ClScaleKernel.h
+++ b/src/core/gpu/cl/kernels/ClScaleKernel.h
@@ -42,7 +42,7 @@ class ClScaleKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClScaleKernel() = default;
+    ClScaleKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScaleKernel);
 
     /** Initialise the kernel's inputs, output and interpolation policy
diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp
index 000c9ad04d..1dd905d66e 100644
--- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -154,6 +154,11 @@ const unsigned int ClLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
 /**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/
 const unsigned int ClLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
 
+ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info)
 {
     auto padding_info = get_padding_info({ &src, &max, &dst, &sum });
@@ -273,6 +278,11 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &
     while(window_collapsed.slide_window_slice_3D(slice));
 }
 
+ClLogits1DNormKernel::ClLogits1DNormKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info)
 {
     auto padding_info = get_padding_info({ &src, &dst, &sum });
diff --git a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h b/src/core/gpu/cl/kernels/ClSoftmaxKernel.h
index af980eaa8e..db1aca3b9b 100644
--- a/src/core/gpu/cl/kernels/ClSoftmaxKernel.h
+++ b/src/core/gpu/cl/kernels/ClSoftmaxKernel.h
@@ -51,7 +51,7 @@ public:
     using ParallelReductionInfo = std::tuple<bool, unsigned int>;
 
     /** Default constructor */
-    ClLogits1DMaxShiftExpSumKernel() = default;
+    ClLogits1DMaxShiftExpSumKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DMaxShiftExpSumKernel);
     /** Configure the kernel using the given information about tensors
      *
@@ -94,7 +94,7 @@ class ClLogits1DNormKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClLogits1DNormKernel() = default;
+    ClLogits1DNormKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DNormKernel);
 
     /** Set the input and output tensors.
diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp b/src/core/gpu/cl/kernels/ClTransposeKernel.cpp
index 704d0152cf..40bd4b034a 100644
--- a/src/core/gpu/cl/kernels/ClTransposeKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClTransposeKernel.cpp
@@ -43,6 +43,11 @@ namespace opencl
 {
 namespace kernels
 {
+ClTransposeKernel::ClTransposeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClTransposeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClTransposeKernel.h b/src/core/gpu/cl/kernels/ClTransposeKernel.h
index 21d4fd41f5..7d1226cc0d 100644
--- a/src/core/gpu/cl/kernels/ClTransposeKernel.h
+++ b/src/core/gpu/cl/kernels/ClTransposeKernel.h
@@ -38,7 +38,7 @@ namespace kernels
 class ClTransposeKernel : public IClKernel
 {
 public:
-    ClTransposeKernel() = default;
+    ClTransposeKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposeKernel);
     /** Set the src and dst of the kernel.
      *
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
index 9f970719ed..8607620e92 100644
--- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
@@ -68,6 +68,11 @@ Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const
     return Status{};
 }
 
+ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
 void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
index ddade29113..56202bad2e 100644
--- a/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
@@ -41,7 +41,7 @@ class ClWidthConcatenate2TensorsKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClWidthConcatenate2TensorsKernel() = default;
+    ClWidthConcatenate2TensorsKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel);
     /** Initialise the kernel's sources and destination
      *
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
index 281d190381..edbc23c1d3 100644
--- a/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
@@ -66,6 +66,7 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons
 
 ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
diff --git a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
index d188a5226b..5510c746f8 100644
--- a/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
@@ -63,6 +63,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con
 
 ClWidthConcatenateKernel::ClWidthConcatenateKernel()
 {
+    _type = CLKernelType::ELEMENTWISE;
 }
 
 Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
index 381b4bcae9..ae43fed12d 100644
--- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
@@ -91,6 +91,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 }
 } // namespace
 
+ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel()
+{
+    _type = CLKernelType::WINOGRAD;
+}
+
 void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
index 2bc2ceb36e..13200dc419 100644
--- a/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
+++ b/src/core/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
@@ -40,7 +40,7 @@ class ClWinogradFilterTransformKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClWinogradFilterTransformKernel() = default;
+    ClWinogradFilterTransformKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradFilterTransformKernel);
     /** Set the input and output tensor.
      *
diff --git a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
index 17f0eb9e2c..62db2282e0 100644
--- a/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
@@ -103,6 +103,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 ClWinogradInputTransformKernel::ClWinogradInputTransformKernel()
     : _border_size(0), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
 {
+    _type = CLKernelType::WINOGRAD;
 }
 
 BorderSize ClWinogradInputTransformKernel::border_size() const
diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
index a6c05420ed..f6ade57e5d 100644
--- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -122,6 +122,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 }
 } // namespace
 
+ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel()
+{
+    _type = CLKernelType::WINOGRAD;
+}
+
 void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info,
                                                 const ActivationLayerInfo &act_info)
 {
diff --git a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
index 48b27e658c..2948d3f181 100644
--- a/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
+++ b/src/core/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
@@ -40,7 +40,7 @@ class ClWinogradOutputTransformKernel : public IClKernel
 {
 public:
     /** Default constructor */
-    ClWinogradOutputTransformKernel() = default;
+    ClWinogradOutputTransformKernel();
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradOutputTransformKernel);
     /** Set the input and output tensor.
      *
author	Giorgio Arena <giorgio.arena@arm.com>	2021-06-28 11:00:27 +0100
committer	Giorgio Arena <giorgio.arena@arm.com>	2021-06-29 11:06:10 +0000
commit	4a95bba6ca61ce99995ece6fd237b5498c9f322c (patch)
tree	04ef2dee1cdc015ce14ce6b664d9a956a96c3a05
parent	731fe667e3009bfbfee8b0eb74ecb68b291e4311 (diff)
download	ComputeLibrary-4a95bba6ca61ce99995ece6fd237b5498c9f322c.tar.gz