author    Gian Marco Iodice <gianmarco.iodice@arm.com>    2022-12-30 16:07:45 +0000
committer Gian Marco Iodice <gianmarco.iodice@arm.com>    2023-01-10 09:57:51 +0000
commit    3cce35dcad8bc8f53a1e6613f719af9ab04feda6 (patch)
tree      e1015566852ebce4af897db37cf5cb1989c29924
parent    d2d9361a0a338bce478f7d85b4af70d1ed20f26c (diff)
download  ComputeLibrary-3cce35dcad8bc8f53a1e6613f719af9ab04feda6.tar.gz
Extend cl image support to input and output tensors
- Add support for texture image to input and output of direct convolution
- Extend T_LOAD2D_INDIRECT macro to read values from cl image storages

Resolves COMPMID-5715

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Change-Id: Idb0410f53f6d0763cd9e39895a7cbf9bc826d33a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8904
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/KernelDescriptors.h                                   4
-rw-r--r--  src/core/CL/CLUtils.cpp                                               18
-rw-r--r--  src/core/CL/CLUtils.h                                                 12
-rw-r--r--  src/core/CL/cl_kernels/helpers.h                                      28
-rw-r--r--  src/core/CL/cl_kernels/nhwc/direct_convolution.cl                     42
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl                     24
-rw-r--r--  src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl              21
-rw-r--r--  src/core/CL/cl_kernels/nhwc/indirect_convolution.cl                   40
-rw-r--r--  src/core/CL/cl_kernels/nhwc/scale.cl                                  64
-rw-r--r--  src/core/CL/cl_kernels/nhwc/transposed_convolution.cl                 14
-rw-r--r--  src/core/CL/cl_kernels/tile_helpers.h                                 86
-rw-r--r--  src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp        9
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp                  8
-rw-r--r--  src/gpu/cl/kernels/ClDirectConv2dKernel.cpp                           67
-rw-r--r--  src/gpu/cl/kernels/ClDirectConv2dKernel.h                              6
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp              4
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp       4
-rw-r--r--  src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp   4
-rw-r--r--  src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp                          4
19 files changed, 312 insertions, 147 deletions
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index 4a64032b14..19ac254c04 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -117,6 +117,8 @@ struct DirectConvComputeKernelInfo
int32_t n0{ 1 }; /**< Number of columns to be processed by the kernel */
int32_t k0{ 1 }; /**< Number of partial accumulations to be processed in a single iteration by the kernel */
bool export_weights_to_cl_image{ false }; /**< Flag to export the weights to cl_image */
+ bool export_output_to_cl_image{ false }; /**< Flag to export the output to cl_image */
+ bool export_input_to_cl_image{ false }; /**< Flag to export the input to cl_image */
};
/** Descriptor used by the softmax kernels */
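As a quick orientation for the descriptor change above, here is a minimal host-side sketch (C++) of how a caller could populate the extended DirectConvComputeKernelInfo. The field values chosen below are illustrative assumptions only; the heuristics that actually pick them are not part of this patch.

// Illustrative sketch, not library code: the values are assumptions, not the library heuristics.
#include "arm_compute/core/KernelDescriptors.h"

arm_compute::DirectConvComputeKernelInfo make_example_desc()
{
    arm_compute::DirectConvComputeKernelInfo desc{};
    desc.n0                         = 4;     // columns processed per work-item
    desc.k0                         = 4;     // partial accumulations per iteration
    desc.export_weights_to_cl_image = true;  // pre-existing flag
    desc.export_input_to_cl_image   = false; // new flag introduced by this patch
    desc.export_output_to_cl_image  = false; // new flag introduced by this patch
    return desc;
}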
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 8f39c2d700..84cf88e099 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022 Arm Limited.
+ * Copyright (c) 2020-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,7 @@
namespace arm_compute
{
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType type)
{
cl_channel_type cl_data_type;
@@ -61,7 +61,17 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer
desc.image_width = shape2d[0];
desc.image_height = shape2d[1];
- cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
+ switch(type)
+ {
+ case CLImage2DType::ReadOnly:
+ cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
+ break;
+ case CLImage2DType::WriteOnly:
+ cl_image = clCreateImage(ctx(), CL_MEM_WRITE_ONLY, &format, &desc, nullptr, &err);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported CLImage2DType");
+ }
ARM_COMPUTE_UNUSED(err);
ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Error during the creation of CL image from buffer");
@@ -176,4 +186,4 @@ void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name,
}
} // namespace experimental
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index d133e4fe6f..b31944c72f 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,13 @@ class TensorShape;
class CLBuildOptions;
class ITensorInfo;
+/** OpenCL Image2D types */
+enum class CLImage2DType
+{
+ ReadOnly,
+ WriteOnly
+};
+
/** Create a cl::Image2D object from an OpenCL buffer
*
* @note The following conditions are required to create a OpenCL image object from OpenCL buffer,
@@ -49,10 +56,11 @@ class ITensorInfo;
* @param[in] shape2d 2D tensor shape
* @param[in] data_type DataType to use. Only supported: F32,F16
* @param[in] image_row_pitch Image row pitch (a.k.a. stride Y) to be used in the image2d object
+ * @param[in] image_type Image 2D type (@ref CLImage2DType)
*
* @return cl::Image2D object
*/
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch);
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
namespace experimental
{
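To show the new parameter in context, a hedged host-side sketch (C++) of calling the extended create_image2d_from_buffer() overload. The helper name export_output_as_image() and the way the tensor is obtained are assumptions; only the signature and the CLImage2DType enumerators come from this patch.

// Sketch only: export_output_as_image is a hypothetical helper, not part of the library.
#include "src/core/CL/CLUtils.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

cl::Image2D export_output_as_image(const ICLTensor *dst)
{
    // NHWC tensor viewed as a 2D image: 4 channel values per pixel, W * H * N rows.
    const size_t      image_w         = dst->info()->dimension(0) / 4;
    const size_t      image_h         = dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3);
    const size_t      image_row_pitch = dst->info()->strides_in_bytes()[1];
    const TensorShape shape2d(image_w, image_h);

    // CLImage2DType::WriteOnly is new in this patch; ReadOnly preserves the previous behaviour.
    return create_image2d_from_buffer(CLKernelLibrary::get().context(), dst->cl_buffer(),
                                      shape2d, dst->info()->data_type(), image_row_pitch,
                                      CLImage2DType::WriteOnly);
}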
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 298edc244f..c5d94ccb04 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -438,6 +438,16 @@
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
+#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
/** Utility macro to read a 2D OpenCL image object.
*
* @note Coordinates are not normalized
@@ -454,6 +464,22 @@
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+/** Utility macro to write a 2D OpenCL image object.
+ *
+ * @note Coordinates are not normalized
+ *
+ * @param[in] data_type Data type
+ * @param[in] n0 Number of pixels to write. Only 1, 2 and 4 are supported
+ * @param[in] img OpenCL image object
+ * @param[in] x_coord The x coordinate for the top-left pixel
+ * @param[in] y_coord The y coordinate for the top-left pixel
+ * @param[in] values Values to write
+ *
+ * @{
+ */
+#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
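A kernel-side sketch (OpenCL C) of the new WRITE_IMAGE2D macro. The kernel name and coordinates are placeholders; with data_type=float and n0=4 the macro resolves to write_image2d_floatx4, i.e. four consecutive write_imagef() calls consuming one float16.

#include "helpers.h"

// Sketch only: one work-item writes four consecutive image pixels (16 float values).
__kernel void write_image2d_example(__write_only image2d_t img)
{
    const int x = get_global_id(0) * 4; // x coordinate (in pixels) of the leftmost pixel written
    const int y = get_global_id(1);

    float16 values = (float16)(1.0f);

    // Expands to write_image2d_floatx4(img, x, y, values)
    WRITE_IMAGE2D(float, 4, img, x, y, values);
}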
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index 4693a1fbcd..81ceeb8846 100644
--- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,36 +66,36 @@
* - The weights offset e.g. -DWEI_OFFSET=4
* - The quantized zero value e.g. -DZERO_VALUE=4
*
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
* @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
* @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
- * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
- * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
* @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
* @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
* @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
@@ -103,9 +103,9 @@
*/
//! @endcond
__kernel void direct_convolution_nhwc(
- TENSOR4D_T(src, SRC_TENSOR_TYPE),
- TENSOR4D_T(dst, DST_TENSOR_TYPE),
- TENSOR4D_T(wei, WEI_TENSOR_TYPE)
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bia)
@@ -292,4 +292,4 @@ __kernel void direct_convolution_nhwc(
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER
-}
\ No newline at end of file
+}
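For readability, an approximate expansion (OpenCL C) of the updated kernel prototype when it is built with -DSRC_TENSOR_TYPE=IMAGE, -DDST_TENSOR_TYPE=IMAGE and -DWEI_TENSOR_TYPE=IMAGE. The buffer components come from TENSOR4D_T_BUFFER in tile_helpers.h and are abbreviated here; see the tile_helpers.h hunks further down for the dispatch macros.

// Approximate expansion, for illustration only; buffer components abbreviated.
__kernel void direct_convolution_nhwc(
    __read_only image2d_t  src_img,  // prepended by TENSOR4D_RO_T when SRC_TENSOR_TYPE=IMAGE
    __global uchar        *src_ptr,
    /* ... src strides, sizes, offset ... */
    __write_only image2d_t dst_img,  // prepended by TENSOR4D_WO_T when DST_TENSOR_TYPE=IMAGE
    __global uchar        *dst_ptr,
    /* ... dst strides, sizes, offset ... */
    __read_only image2d_t  wei_img,  // prepended by TENSOR4D_RO_T when WEI_TENSOR_TYPE=IMAGE
    __global uchar        *wei_ptr
    /* ... wei strides, sizes, offset, then the optional bias vector ... */)
{
    // body omitted
}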
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index 345469063a..dcbae220b6 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,6 +54,7 @@
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
* @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
*
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
* @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
@@ -63,6 +64,7 @@
* @param[in] src_h The size of the height dimension of the source tensor
* @param[in] src_n The size of the batches dimension of the source tensor
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
@@ -72,16 +74,16 @@
* @param[in] dst_h The size of the height dimension of the destination tensor
* @param[in] dst_n The size of the batches dimension of the destination tensor
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
* @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
- * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
- * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix
* @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr
* @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
* @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
@@ -89,9 +91,9 @@
*/
//! @endcond
__kernel void dwc_native_fp_nhwc(
- TENSOR4D_T(src, SRC_TENSOR_TYPE),
- TENSOR4D_T(dst, DST_TENSOR_TYPE),
- TENSOR4D(wei, WEI_TENSOR_TYPE)
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bia)
@@ -206,4 +208,4 @@ __kernel void dwc_native_fp_nhwc(
}
#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
// *INDENT-ON*
-// clang-format on
\ No newline at end of file
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index e2ffd444dd..2d255e5b61 100644
--- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,6 +77,7 @@
* @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
* @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1)
*
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
* @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
@@ -86,6 +87,7 @@
* @param[in] src_h The size of the height dimension of the source tensor
* @param[in] src_n The size of the batches dimension of the source tensor
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
@@ -95,14 +97,15 @@
* @param[in] dst_h The size of the height dimension of the destination tensor
* @param[in] dst_n The size of the batches dimension of the destination tensor
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
* @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
- * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_c The size of the channels dimension of the weights tensor
+ * @param[in] wei_w The size of the width dimension of the weights tensor
+ * @param[in] wei_h The size of the height dimension of the weights tensor
+ * @param[in] wei_n The size of the batches dimension of the weights tensor
* @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
* @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32
@@ -120,9 +123,9 @@
*/
//! @endcond
__kernel void dwc_native_quantized_nhwc(
- TENSOR4D_T(src, SRC_TENSOR_TYPE),
- TENSOR4D_T(dst, DST_TENSOR_TYPE),
- TENSOR4D(wei, WEI_TENSOR_TYPE),
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE),
VECTOR_DECLARATION(dst_multipliers),
VECTOR_DECLARATION(dst_shifts)
#if defined(HAS_BIAS)
@@ -269,4 +272,4 @@ __kernel void dwc_native_quantized_nhwc(
}
#endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
// *INDENT-ON*
-// clang-format on
\ No newline at end of file
+// clang-format on
diff --git a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
index c88f0034c5..aa719bfef0 100644
--- a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@
* @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* - M0 = 1, 2, 3, 4, 5, 6, 7, and 8
*
- * @param[out] dst_img (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only)
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: INT32
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
@@ -54,7 +54,7 @@
*/
//! @endcond
__kernel void indirect_convolution_address_precalculation(
- TENSOR4D_T(dst, DST_TENSOR_TYPE))
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE))
{
const int x = get_global_id(0);
const int y = get_global_id(1);
@@ -123,7 +123,7 @@ __kernel void indirect_convolution_address_precalculation(
* - N0 = 2, 3, 4, 8, 16
* - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
*
- * @param[in] src_img (Not supported) CLImage object to the source tensor (SRC_TENSOR_TYPE=IMAGE only)
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
* @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
@@ -133,7 +133,7 @@ __kernel void indirect_convolution_address_precalculation(
* @param[in] src_h The size of the height dimension of the source tensor
* @param[in] src_n The size of the batches dimension of the source tensor
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] off_img (Not supported) CLImage object to the indirect buffer tensor (OFF_TENSOR_TYPE=IMAGE only)
+ * @param[in] off_img (Not supported) Read only cl_image object for the indirect buffer tensor. Included when OFF_TENSOR_TYPE=IMAGE
* @param[in] off_ptr Pointer to the indirect buffer tensor. Supported data type: INT32
* @param[in] off_stride_y Stride of the indirect buffer tensor in Y dimension (in bytes)
* @param[in] off_stride_z Stride of the indirect buffer tensor in Z dimension (in bytes)
@@ -143,8 +143,8 @@ __kernel void indirect_convolution_address_precalculation(
* @param[in] off_h The size of the height dimension of the indirect buffer tensor
* @param[in] off_n The size of the batches dimension of the indirect buffer tensor
* @param[in] off_offset_first_element_in_bytes The offset of the first element in the indirect buffer tensor
- * @param[out] dst_img (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only)
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as the input tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
@@ -153,8 +153,8 @@ __kernel void indirect_convolution_address_precalculation(
* @param[in] dst_h The size of the height dimension of the destination tensor
* @param[in] dst_n The size of the batches dimension of the destination tensor
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] wei_img (Optional) CLImage object to the destination tensor (WEI_TENSOR_TYPE=IMAGE only)
- * @param[out] wei_ptr Pointer to the weights tensor. Supported data type: same as the input tensor
+ * @param[out] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
+ * @param[out] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
* @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
* @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
* @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
@@ -163,23 +163,17 @@ __kernel void indirect_convolution_address_precalculation(
* @param[in] wei_h The size of the height dimension of the weights tensor
* @param[in] wei_n The size of the batches dimension of the weights tensor
* @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[out] bia_img (Not supported) CLImage object to the destination tensor (BIA_TENSOR_TYPE=IMAGE only)
- * @param[out] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as the input tensor
- * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
- * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
- * @param[in] bia_stride_w (Optional) Stride of the bias tensor in W dimension (in bytes)
- * @param[in] bia_c (Optional) The size of the channels dimension of the bias tensor
- * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor
- * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor
- * @param[in] bia_n (Optional) The size of the batches dimension of the bias tensor
- * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr
+ * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
*/
//! @endcond
__kernel void indirect_convolution_nhwc(
- TENSOR4D_T(src, SRC_TENSOR_TYPE),
- TENSOR4D_T(off, OFF_TENSOR_TYPE),
- TENSOR4D_T(dst, DST_TENSOR_TYPE),
- TENSOR4D_T(wei, WEI_TENSOR_TYPE)
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_RO_T(off, OFF_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bia)
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index f6a3e0971b..e071b0f192 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,8 +61,8 @@
*/
//! @endcond
__kernel void scale_nearest_neighbour_nhwc(
- TENSOR4D_T(src, SRC_TENSOR_TYPE),
- TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
const float scale_x,
const float scale_y)
{
@@ -128,31 +128,33 @@ __kernel void scale_nearest_neighbour_nhwc(
* - The source offset e.g. -DOFFSET=4
* - The source scale e.g. -DSCALE=4
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_c The size of the channels dimension of the source tensor
- * @param[in] src_w The size of the width dimension of the source tensor
- * @param[in] src_h The size of the height dimension of the source tensor
- * @param[in] src_n The size of the batches dimension of the source tensor
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_c The size of the channels dimension of the destination tensor
- * @param[in] dst_w The size of the width dimension of the destination tensor
- * @param[in] dst_h The size of the height dimension of the destination tensor
- * @param[in] dst_n The size of the batches dimension of the destination tensor
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale_x The scale value to apply on the source width
- * @param[in] scale_y The scale value to apply on the source height
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
*/
//! @endcond
__kernel void scale_bilinear_nhwc(
- TENSOR4D_T(src, SRC_TENSOR_TYPE),
- TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
const float scale_x,
const float scale_y)
{
@@ -224,11 +226,11 @@ __kernel void scale_bilinear_nhwc(
const float a1 = (yi_f - (float)yi);
const float b1 = (1.f - a1);
- out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
- (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
- (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
- (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1),
- VEC_DATA_TYPE(DST_DATA_TYPE, N0));
+ out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
+ (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
+ (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
+ (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1),
+ VEC_DATA_TYPE(DST_DATA_TYPE, N0));
#endif // defined(IS_FLOATING_POINT)
TILE(uint, 1, 1, dst_indirect_y);
@@ -240,4 +242,4 @@ __kernel void scale_bilinear_nhwc(
T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
}
-#endif /* SCALE_BILINEAR */
\ No newline at end of file
+#endif /* SCALE_BILINEAR */
diff --git a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
index fe6182fc95..1393537283 100644
--- a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,7 +64,7 @@
* - The weights offset e.g. -DWEI_OFFSET=4
* - The quantized zero value e.g. -DZERO_VALUE=4
*
- *
+ * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
* @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
@@ -73,6 +73,7 @@
* @param[in] src_w The size of the width dimension of the source tensor
* @param[in] src_h The size of the height dimension of the source tensor
* @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
@@ -82,6 +83,7 @@
* @param[in] dst_h The size of the height dimension of the destination tensor
* @param[in] dst_n The size of the batches dimension of the destination tensor
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE
* @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
* @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
* @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
@@ -98,9 +100,9 @@
*/
//! @endcond
__kernel void transposed_convolution_nhwc(
- TENSOR4D_T(src, SRC_TENSOR_TYPE),
- TENSOR4D_T(dst, DST_TENSOR_TYPE),
- TENSOR4D_T(wei, WEI_TENSOR_TYPE)
+ TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
+ TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE)
#if defined(HAS_BIAS)
,
VECTOR_DECLARATION(bia)
@@ -292,4 +294,4 @@ __kernel void transposed_convolution_nhwc(
#undef _IDST_HEIGHT
#undef _IDST_CHANNELS
#undef _IY_MULTIPLIER
-}
\ No newline at end of file
+}
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index acc174d04f..507e172dfb 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -130,8 +130,44 @@
uint name##_offset_first_element_in_bytes
#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
+
+/** Legacy tensor 4D arguments
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
+#define TENSOR4D_RO_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)
+
+/** Read-Only (RO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
+
+#define TENSOR4D_WO_T_IMAGE(name) \
+ __write_only image2d_t name##_img, \
+ TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)
+
+#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)
+
+/** Write-Only (WO) tensor 4D.
+ *
+ * @param[in] name Tensor name. The tensor name is the prefix of the tensor components
+ * @param[in] type Tensor type (BUFFER or IMAGE)
+ */
+#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
+
#define TENSOR3D_T_IMAGE(name) \
__read_only image2d_t name##_img, \
__global uchar *name##_ptr, \
@@ -457,6 +493,25 @@
(0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
+/** Store a vector in global memory (tensor)
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] VALUES Values to store in memory
+ */
+#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
+#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
+ VSTORE(WIDTH) \
+ (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
+#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
+
/** Load a tile from global memory (tensor)
*
* @param[in] DATA_TYPE Data type
@@ -658,7 +713,8 @@
* @param[in] DATA_TYPE Data type
* @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension
* @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
- * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * When TENSOR_TYPE=IMAGE, the if condition for the out-of-bound check can be skipped
* In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
* @param[in] TENSOR Tensor basename
* @param[in] C Starting C index
@@ -667,15 +723,25 @@
* 16 is the maximum indirect buffer size.
* @param[out] dst Output tile
*/
-#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
- ({ \
- LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
- { \
- if(yi[0].s[_i] >= 0) \
- { \
- dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
- } \
- }) \
+#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
+#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ if(yi[0].s[_i] >= 0) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+ } \
+ }) \
+ })
+
+#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
+ }) \
})
/** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
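To tie the new tile_helpers.h macros together, a self-contained kernel sketch (OpenCL C) that loads a tile through the IMAGE variant of T_LOAD2D_INDIRECT (which drops the per-row bound check, since out-of-range image reads return well-defined values) and stores one row through V_STORE. The kernel, tile sizes and coordinates are placeholders, not a real convolution.

#include "helpers.h"
#include "tile_helpers.h"

#define EX_M0 4 // rows per work-item (placeholder)
#define EX_N0 4 // channels per load; must be a multiple of 4 for IMAGE

// Sketch only: demonstrates the TENSOR_TYPE dispatch added by this patch.
__kernel void t_load2d_indirect_example(
    TENSOR4D_RO_T(src, IMAGE),
    TENSOR4D_WO_T(dst, IMAGE))
{
    TILE(int, 1, EX_M0, yi);       // indirect Y coordinates
    TILE(float, EX_M0, EX_N0, in); // destination tile

    LOOP_UNROLLING(int, _x, 0, 1, EX_M0,
    {
        yi[0].s[_x] = _x; // placeholder row indices
    })

    // IMAGE variant: no if(yi >= 0) guard is emitted, unlike the BUFFER variant.
    T_LOAD2D_INDIRECT(float, EX_M0, EX_N0, IMAGE, src, 0, src_stride_y, yi, in);

    // V_STORE dispatches to WRITE_IMAGE2D for IMAGE tensors.
    V_STORE(float, EX_N0, IMAGE, dst, 0, 0, dst_stride_y, in[0].v);
}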
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index cded31936c..2d21a6eff0 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -377,7 +377,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
const TensorShape shape2d(image_w, image_h);
const size_t image_row_pitch = _input->info()->strides_in_bytes()[1];
- input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch);
+ input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
if(_export_weights_to_cl_image)
@@ -386,7 +386,8 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
const TensorShape shape2d(image_w, image_h);
const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1];
- weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch);
+ weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch,
+ CLImage2DType::ReadOnly);
}
}
@@ -401,7 +402,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm
{
_kernel.setArg(idx++, weights_cl_image);
}
- add_4D_tensor_argument(idx, _weights, slice);
+ add_4d_tensor_nhwc_argument(idx, _weights);
if(_is_quantized)
{
add_1D_tensor_argument(idx, _output_multipliers, slice);
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
index 93fbdfed63..022d4685fe 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,7 +89,7 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
{
const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
break;
@@ -105,7 +105,7 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
{
const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
@@ -135,7 +135,7 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
- TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
+ TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index d9271e24d9..b66163c805 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,6 +64,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor");
+
if(data_layout == DataLayout::NCHW)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
@@ -210,14 +213,26 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
const unsigned int pad_left = conv_info.pad_left();
const unsigned int pad_top = conv_info.pad_top();
- _export_to_cl_image = desc.export_weights_to_cl_image;
+ _export_weights_to_cl_image = desc.export_weights_to_cl_image;
+ _export_input_to_cl_image = desc.export_input_to_cl_image;
+ _export_output_to_cl_image = desc.export_output_to_cl_image;
// Update the padding for the weights tensor if we can export to cl_image
- if(_export_to_cl_image)
+ if(_export_weights_to_cl_image)
{
gemm::update_padding_for_cl_image(weights);
}
+ if(_export_output_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(dst);
+ }
+
+ if(_export_input_to_cl_image)
+ {
+ gemm::update_padding_for_cl_image(src);
+ }
+
if(biases != nullptr)
{
build_options.add_option(std::string("-DHAS_BIAS"));
@@ -241,7 +256,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-cl-fast-relaxed-math");
}
- build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+ build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0)));
build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1)));
@@ -249,9 +264,9 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0)));
build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1)));
build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2)));
- build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER");
build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
- build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+ build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
@@ -307,7 +322,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
}
else
{
- _export_to_cl_image = false;
+ _export_weights_to_cl_image = false;
kernel_name << "direct_convolution_nchw";
build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS"));
@@ -399,8 +414,10 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
if(_data_layout == DataLayout::NHWC)
{
cl::Image2D weights_cl_image;
+ cl::Image2D output_cl_image;
+ cl::Image2D input_cl_image;
- if(_export_to_cl_image)
+ if(_export_weights_to_cl_image)
{
const size_t image_w = weights->info()->dimension(0) / 4;
const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
@@ -408,13 +425,43 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
const size_t image_row_pitch = weights->info()->strides_in_bytes()[1];
// Export cl_buffer to cl_image
- weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch);
+ weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+ }
+
+ if(_export_output_to_cl_image)
+ {
+ const size_t image_w = dst->info()->dimension(0) / 4;
+ const size_t image_h = dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = dst->info()->strides_in_bytes()[1];
+
+ // Export cl_buffer to cl_image
+ output_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), dst->cl_buffer(), shape2d, dst->info()->data_type(), image_row_pitch, CLImage2DType::WriteOnly);
+ }
+
+ if(_export_input_to_cl_image)
+ {
+ const size_t image_w = src->info()->dimension(0) / 4;
+ const size_t image_h = src->info()->dimension(1) * src->info()->dimension(2) * src->info()->dimension(3);
+ const TensorShape shape2d(image_w, image_h);
+ const size_t image_row_pitch = src->info()->strides_in_bytes()[1];
+
+ // Export cl_buffer to cl_image
+ input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), src->cl_buffer(), shape2d, src->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
unsigned int idx = 0;
+ if(_export_input_to_cl_image)
+ {
+ _kernel.setArg(idx++, input_cl_image);
+ }
add_4d_tensor_nhwc_argument(idx, src);
+ if(_export_output_to_cl_image)
+ {
+ _kernel.setArg(idx++, output_cl_image);
+ }
add_4d_tensor_nhwc_argument(idx, dst);
- if(_export_to_cl_image)
+ if(_export_weights_to_cl_image)
{
_kernel.setArg(idx++, weights_cl_image);
}
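Summarising the run_op() changes from the host side: a condensed sketch (C++) of the kernel-argument ordering for the NHWC direct convolution after this patch. Each optional image object must be set immediately before the matching buffer components, and only when the corresponding -D*_TENSOR_TYPE=IMAGE build option was used, otherwise the argument indices no longer line up. The helper set_image_args_sketch() is hypothetical; in the library the buffer components are appended by add_4d_tensor_nhwc_argument() at the positions marked below.

// Hypothetical helper, for illustration only; not a drop-in replacement for run_op().
#include "arm_compute/core/CL/OpenCL.h"

void set_image_args_sketch(cl::Kernel &kernel, unsigned int &idx,
                           const cl::Image2D *src_img,  // non-null when export_input_to_cl_image
                           const cl::Image2D *dst_img,  // non-null when export_output_to_cl_image
                           const cl::Image2D *wei_img)  // non-null when export_weights_to_cl_image
{
    if(src_img != nullptr)
    {
        kernel.setArg(idx++, *src_img); // __read_only image2d_t src_img
    }
    // ... add_4d_tensor_nhwc_argument(idx, src): src buffer strides, sizes and offset ...

    if(dst_img != nullptr)
    {
        kernel.setArg(idx++, *dst_img); // __write_only image2d_t dst_img
    }
    // ... add_4d_tensor_nhwc_argument(idx, dst): dst buffer strides, sizes and offset ...

    if(wei_img != nullptr)
    {
        kernel.setArg(idx++, *wei_img); // __read_only image2d_t wei_img
    }
    // ... add_4d_tensor_nhwc_argument(idx, weights), then the optional bias vector ...
}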
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
index 0cb8aebbe1..25171a0536 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,7 +84,9 @@ public:
public:
DataLayout _data_layout{};
PadStrideInfo _conv_info{};
- bool _export_to_cl_image{ false };
+ bool _export_weights_to_cl_image{ false };
+ bool _export_output_to_cl_image{ false };
+ bool _export_input_to_cl_image{ false };
};
} // namespace kernels
} // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index 6a450b652b..f74a5d87af 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -360,7 +360,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind
const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch);
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
do
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index a8bcf8d6a1..efd0a95eab 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -375,7 +375,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con
const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch);
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
do
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index fe46913517..f252afb06a 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -325,7 +325,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors,
const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
const size_t image_row_pitch = src1->info()->strides_in_bytes()[1];
- src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch);
+ src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
Window slice = window.first_slice_window_3D();
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
index 3448377cb5..b8915cc406 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -282,7 +282,7 @@ void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window,
const size_t image_row_pitch = weights->info()->strides_in_bytes()[1];
// Export cl_buffer to cl_image
- weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch);
+ weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
}
unsigned int idx = 0;