Use the stable CKW API in the GPU dynamic fusion backend

- Refactor all kernels to work with the CKW stable API - Add support for sub-tile in the op_load/op_store CKW operator - Fix mismatch in resize - Add comments in all kernels written with CKW to help developers understand the structure of the code - Add texture image support in depthwise convolution written with CKW - Add support for different block sizes in depthwise convolution - Remove the use of the dynamic fusion helper functions. - Add support for floor in the op_unary() of CKW Resolves: COMPMID-6708, COMPMID-6743, COMPMID-6530 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com> Signed-off-by: Jakub Sujak <jakub.sujak@arm.com> Change-Id: I8104ce4d04a3138a1aeb0b84940e1f1c89e76069 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10914 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: Gunes Bayir <gunes.bayir@arm.com> 2024-01-17 16:07:03 +0000
committer: Viet-Hoa Do <viet-hoa.do@arm.com> 2024-02-01 16:00:34 +0000
commit: 2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c (patch)
tree: a4e2d5ce46443a79a0778e4960462ce3edf106ec /compute_kernel_writer
parent: 7ab7fca87cca8775f82b0e9efec6a40975910c17 (diff)
download: ComputeLibrary-2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c.tar.gz
21 files changed, 323 insertions, 150 deletions
diff --git a/compute_kernel_writer/include/ckw/KernelWriter.h b/compute_kernel_writer/include/ckw/KernelWriter.h
index 0d739e859a..da41b940d7 100644
--- a/compute_kernel_writer/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/include/ckw/KernelWriter.h
@@ -25,11 +25,22 @@
 #ifndef CKW_INCLUDE_CKW_KERNELWRITER_H
 #define CKW_INCLUDE_CKW_KERNELWRITER_H
 
+#include "ckw/Kernel.h"
+#include "ckw/TensorInfo.h"
 #include "ckw/TensorOperand.h"
+#include "ckw/TensorSampler.h"
+#include "ckw/TileInfo.h"
 #include "ckw/TileOperand.h"
 #include "ckw/types/ConstantData.h"
 #include "ckw/types/ConvertPolicy.h"
+#include "ckw/types/DataType.h"
 #include "ckw/types/Operators.h"
+#include "ckw/types/TargetArchitecture.h"
+#include "ckw/types/TargetLanguage.h"
+#include "ckw/types/TensorComponentType.h"
+#include "ckw/types/TensorDataLayout.h"
+#include "ckw/types/TensorSamplerTypes.h"
+#include "ckw/types/TensorStorageType.h"
 
 #include <functional>
 #include <memory>
@@ -39,16 +50,8 @@
 namespace ckw
 {
 
-/** Forward Declerations */
-class Kernel;
-class TensorInfo;
-class TensorSampler;
+/** Forward Declarations */
 class TileArea;
-class TileInfo;
-
-enum class DataType;
-enum class TargetArchitecture;
-enum class TargetLanguage;
 
 /** A kernel writer.
  *
@@ -350,7 +353,6 @@ public:
                                   const TileOperand   &z,
                                   const TileOperand   &batch_op) = 0;
 
-protected:
     // =============================================================================================
     // ID space management
     // =============================================================================================
@@ -367,6 +369,7 @@ protected:
     /** Get the current ID space. */
     int32_t id_space() const;
 
+protected:
     /** Set the current ID space.
      *
      * @param[in] value The ID space to be used.
diff --git a/compute_kernel_writer/include/ckw/TensorOperand.h b/compute_kernel_writer/include/ckw/TensorOperand.h
index 2672cd5334..a3e53d1314 100644
--- a/compute_kernel_writer/include/ckw/TensorOperand.h
+++ b/compute_kernel_writer/include/ckw/TensorOperand.h
@@ -43,6 +43,15 @@ public:
     // Only kernel writer class interacts with tensor operand hence we allow it to access this field.
     friend class KernelWriter;
 
+    /** Create an empty tensor operand.
+     *
+     * The new tensor operand doesn't refer to any tensor therefore it is not useable.
+     */
+    TensorOperand();
+
+    /** Check if the tensor operand contains a tensor and therefore useable. */
+    bool is_valid() const;
+
     /** Get the tensor info. */
     const TensorInfo &info() const;
 
@@ -92,7 +101,7 @@ private:
     /** Initialize a new instance of @ref TensorOperand class for a tensor. */
     TensorOperand(ITensor &tensor);
 
-    ITensor &_tensor;
+    ITensor *_tensor;
 };
 
 } // namespace ckw
diff --git a/compute_kernel_writer/include/ckw/TileOperand.h b/compute_kernel_writer/include/ckw/TileOperand.h
index 56dc5e7b2b..556d589bc0 100644
--- a/compute_kernel_writer/include/ckw/TileOperand.h
+++ b/compute_kernel_writer/include/ckw/TileOperand.h
@@ -33,6 +33,7 @@ namespace ckw
 class KernelWriter;
 class TensorOperand;
 class ITile;
+class TileInfo;
 
 /** A tile operand refers to a tile object that can be used for kernel writing. */
 class TileOperand
@@ -43,6 +44,18 @@ public:
     friend class KernelWriter;
     friend class TensorOperand;
 
+    /** Create an empty tile operand.
+     *
+     * The new tile operand doesn't refer to any tile therefore it is not useable.
+     */
+    TileOperand();
+
+    /** Check if the tile operand contains a tile and therefore useable. */
+    bool is_valid() const;
+
+    /** Get the tile info. */
+    const TileInfo &tile_info() const;
+
     /** Get a row vector of the current tile operand.
      *
      * @param[in] row The index of the row to be accessed in the current tile operand.
diff --git a/compute_kernel_writer/include/ckw/types/ConstantData.h b/compute_kernel_writer/include/ckw/types/ConstantData.h
index 7708818ca8..ea95049c9e 100644
--- a/compute_kernel_writer/include/ckw/types/ConstantData.h
+++ b/compute_kernel_writer/include/ckw/types/ConstantData.h
@@ -53,6 +53,10 @@ public:
     template <typename T>
     ConstantData(std::initializer_list<std::initializer_list<T>> values, DataType data_type);
 
+    /** Templated constructor */
+    template <typename T>
+    ConstantData(const std::vector<std::vector<T>> &values, DataType data_type);
+
 private:
     /** Validate the given data type and the template type
      *
diff --git a/compute_kernel_writer/include/ckw/types/Operators.h b/compute_kernel_writer/include/ckw/types/Operators.h
index 1e5f9bd542..77b0519422 100644
--- a/compute_kernel_writer/include/ckw/types/Operators.h
+++ b/compute_kernel_writer/include/ckw/types/Operators.h
@@ -43,6 +43,7 @@ enum class UnaryOp : int32_t
     Fabs  = 0x0014,
     Log   = 0x0015,
     Round = 0x0016,
+    Floor = 0x0017,
 };
 
 /** Assignment operators. */
diff --git a/compute_kernel_writer/src/KernelWriter.cpp b/compute_kernel_writer/src/KernelWriter.cpp
index a478231c09..92a36746ce 100644
--- a/compute_kernel_writer/src/KernelWriter.cpp
+++ b/compute_kernel_writer/src/KernelWriter.cpp
@@ -107,7 +107,8 @@ TensorOperand KernelWriter::create_tensor_operand(ITensor &tensor)
 
 ITensor &KernelWriter::get_tensor(const TensorOperand &operand)
 {
-    return operand._tensor;
+    CKW_ASSERT(operand._tensor != nullptr);
+    return *operand._tensor;
 }
 
 const std::vector<std::vector<std::string>> &KernelWriter::get_values(const ConstantData &data)
diff --git a/compute_kernel_writer/src/TensorOperand.cpp b/compute_kernel_writer/src/TensorOperand.cpp
index bf11d0d332..94997537d8 100644
--- a/compute_kernel_writer/src/TensorOperand.cpp
+++ b/compute_kernel_writer/src/TensorOperand.cpp
@@ -21,91 +21,115 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "ckw/TensorOperand.h"
 
+#include "ckw/Error.h"
+
 #include "src/ITensor.h"
 
 namespace ckw
 {
 
-TensorOperand::TensorOperand(ITensor &tensor) : _tensor(tensor)
+TensorOperand::TensorOperand() : _tensor(nullptr)
+{
+}
+
+TensorOperand::TensorOperand(ITensor &tensor) : _tensor(&tensor)
+{
+}
+
+bool TensorOperand::is_valid() const
 {
+    return _tensor != nullptr;
 }
 
 const TensorInfo &TensorOperand::info() const
 {
-    return _tensor.info();
+    CKW_ASSERT(is_valid() == true);
+    return _tensor->info();
 }
 
 TileOperand TensorOperand::stride0()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride0));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride0));
 }
 
 TileOperand TensorOperand::stride1()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride1));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride1));
 }
 
 TileOperand TensorOperand::stride2()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride2));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride2));
 }
 
 TileOperand TensorOperand::stride3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride3));
 }
 
 TileOperand TensorOperand::stride4()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Stride4));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Stride4));
 }
 
 TileOperand TensorOperand::dim0()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim0));
+    return TileOperand(_tensor->component(TensorComponentType::Dim0));
 }
 
 TileOperand TensorOperand::dim1()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim1));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim1));
 }
 
 TileOperand TensorOperand::dim2()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim2));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim2));
 }
 
 TileOperand TensorOperand::dim3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim3));
 }
 
 TileOperand TensorOperand::dim4()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim4));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim4));
 }
 
 TileOperand TensorOperand::dim1_dim2()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim1xDim2));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim1xDim2));
 }
 
 TileOperand TensorOperand::dim1_dim2_dim3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim1xDim2xDim3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim1xDim2xDim3));
 }
 
 TileOperand TensorOperand::dim2_dim3()
 {
-    return TileOperand(_tensor.component(TensorComponentType::Dim2xDim3));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::Dim2xDim3));
 }
 
 TileOperand TensorOperand::offset_first_element_in_bytes()
 {
-    return TileOperand(_tensor.component(TensorComponentType::OffsetFirstElement));
+    CKW_ASSERT(is_valid() == true);
+    return TileOperand(_tensor->component(TensorComponentType::OffsetFirstElement));
 }
 
 } // namespace ckw
diff --git a/compute_kernel_writer/src/TensorSampler.cpp b/compute_kernel_writer/src/TensorSampler.cpp
index 91d5af2fd0..e81c5f9d66 100644
--- a/compute_kernel_writer/src/TensorSampler.cpp
+++ b/compute_kernel_writer/src/TensorSampler.cpp
@@ -27,6 +27,8 @@
 namespace ckw
 {
 
+TensorSampler::TensorSampler() = default;
+
 TensorSampler::TensorSampler(TensorStorageType         storage,
                              TensorSamplerFormat       format,
                              TensorSamplerAddressModeX address_mode_x,
diff --git a/compute_kernel_writer/src/TileOperand.cpp b/compute_kernel_writer/src/TileOperand.cpp
index 865ef85a13..8ced6cfe3f 100644
--- a/compute_kernel_writer/src/TileOperand.cpp
+++ b/compute_kernel_writer/src/TileOperand.cpp
@@ -31,6 +31,10 @@
 namespace ckw
 {
 
+TileOperand::TileOperand() : _tile(nullptr), _row_start(0), _row_end(0), _col_start(0), _col_end(0)
+{
+}
+
 TileOperand::TileOperand(ITile &tile)
     : _tile(&tile), _row_start(0), _row_end(tile.info().height()), _col_start(0), _col_end(tile.info().width())
 {
@@ -46,6 +50,16 @@ TileOperand::TileOperand(
     CKW_ASSERT(col_end > col_start && col_end <= _tile->info().width());
 }
 
+bool TileOperand::is_valid() const
+{
+    return _tile != nullptr;
+}
+
+const TileInfo &TileOperand::tile_info() const
+{
+    return _tile->info();
+}
+
 TileOperand TileOperand::tile(int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end) const
 {
     CKW_ASSERT(row_start >= 0 && _row_start + row_start < _row_end);
diff --git a/compute_kernel_writer/src/TileView.h b/compute_kernel_writer/src/TileView.h
index 50ae66b389..42854ac823 100644
--- a/compute_kernel_writer/src/TileView.h
+++ b/compute_kernel_writer/src/TileView.h
@@ -78,6 +78,10 @@ template <typename T>
 class TileView
 {
 public:
+    /** Default constructor */
+    TileView() : _tile(nullptr), _area(0, 0, 0, 0)
+    {
+    }
     /** Create a tile view that refers to the whole tile.
      *
      * @param[in] tile The tile object.
@@ -179,6 +183,22 @@ public:
                col_end() == _tile->info().width();
     }
 
+    /** Set the rectangular active area.
+     *
+     * @param[in] area The rectangular active area.
+    */
+    TileView &area(const TileArea &area)
+    {
+        _area = area;
+        return *this;
+    }
+
+    /** Get the tile area */
+    TileArea area() const
+    {
+        return _area;
+    }
+
 private:
     const T *_tile;
     TileArea _area;
diff --git a/compute_kernel_writer/src/cl/CLHelpers.cpp b/compute_kernel_writer/src/cl/CLHelpers.cpp
index 8e4a932764..252c5cdfcb 100644
--- a/compute_kernel_writer/src/cl/CLHelpers.cpp
+++ b/compute_kernel_writer/src/cl/CLHelpers.cpp
@@ -193,6 +193,9 @@ std::tuple<bool, std::string> cl_get_unary_op(UnaryOp op)
         case UnaryOp::Round:
             return {true, "round"};
 
+        case UnaryOp::Floor:
+            return {true, "floor"};
+
         default:
             CKW_THROW_MSG("Unsupported unary operation!");
     }
diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.cpp b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
index 62e6853a7a..8b4876b6a7 100644
--- a/compute_kernel_writer/src/cl/CLKernelWriter.cpp
+++ b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
@@ -47,6 +47,25 @@
 #include <tuple>
 #include <vector>
 
+namespace
+{
+std::string generate_cl_extensions()
+{
+    std::string ext = R"(
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(cl_khr_fp16)
+
+#if defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf);
+
+#define inf (INFINITY)
+)";
+    return ext;
+}
+} // namespace
+
 namespace ckw
 {
 
@@ -56,7 +75,7 @@ CLKernelWriter::~CLKernelWriter() = default;
 std::unique_ptr<Kernel> CLKernelWriter::emit_kernel(const std::string &name)
 {
     std::string code;
-
+    code += generate_cl_extensions();
     code += "__kernel void ";
     code += name;
     code += "\n(\n";
@@ -154,21 +173,31 @@ void CLKernelWriter::op_cast(const TileOperand &dst, const TileOperand &src, Con
     const auto dst_type_str     = cl_get_variable_datatype_as_string(dst_type, dst_w);
 
     const std::string sat = policy == ConvertPolicy::Saturate ? "_sat" : "";
+
     CKW_ASSERT_IF(policy == ConvertPolicy::Saturate, !is_data_type_float(dst_type));
 
     const auto        broadcast_x = dst_w != 1 && src_w == 1;
     const std::string prefix      = broadcast_x ? "(" + dst_type_str + ")" : "";
 
-    CKW_ASSERT_MSG(src_view.data_type() != dst_view.data_type(), "Source and destination type must be different.");
     CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1,
                    "Tile height must match or source is broadcasting in y dimension.");
     CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension.");
 
     // Broadcasting on y dimension is automatic (see CLTile::vector).
-    for (int32_t y = 0; y < dst_h; ++y)
+    if (src_view.data_type() == dst_view.data_type())
+    {
+        for (int32_t y = 0; y < dst_h; ++y)
+        {
+            append_code(dst_view.vector(y).str, " = ", src_view.vector(y).str, ";\n");
+        }
+    }
+    else
     {
-        append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(",
-                    src_view.vector(y).str, ");\n");
+        for (int32_t y = 0; y < dst_h; ++y)
+        {
+            append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(",
+                        src_view.vector(y).str, ");\n");
+        }
     }
 }
 
@@ -219,18 +248,12 @@ void CLKernelWriter::op_binary(const TileOperand &dst, BinaryOp op, const TileOp
 
     CKW_ASSERT_MSG(lhs_view.data_type() == rhs_view.data_type(), "LHS and RHS type must match.");
 
-    CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1,
-                   "LHS tile height must match or source is broadcasting in y dimension.");
-    CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1,
-                   "RHS tile height must match or source is broadcasting in y dimension.");
-
-    CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1,
-                   "LHS tile width must match destination or LHS is broadcasting in x dimension.");
-    CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1,
-                   "RHS tile width must match destination or RHS is broadcasting in x dimension.");
-
     if (op == BinaryOp::MatMul_Nt_T)
     {
+        CKW_ASSERT_MSG(lhs_view.height() == dst_h, "LHS tile height must match the DST tile height");
+        CKW_ASSERT_MSG(rhs_view.height() == dst_w, "RHS tile height must match the DST tile width");
+        CKW_ASSERT_MSG(lhs_view.width() == rhs_view.width(), "LHS tile width must match the LHS tile width");
+
         CKW_ASSERT(is_data_type_float(data_type));
 
         for (int32_t y = 0; y < dst_h; ++y)
@@ -239,14 +262,24 @@ void CLKernelWriter::op_binary(const TileOperand &dst, BinaryOp op, const TileOp
             {
                 for (int32_t k = 0; k < lhs_w; ++k)
                 {
-                    append_code(dst_view.scalar(x, y).str, " = fma(", lhs_view.scalar(k, y).str, ", ",
-                                rhs_view.scalar(k, x).str, ", ", dst_view.scalar(x, y).str, ");\n");
+                    append_code(dst_view.scalar(y, x).str, " = fma(", lhs_view.scalar(y, k).str, ", ",
+                                rhs_view.scalar(x, k).str, ", ", dst_view.scalar(y, x).str, ");\n");
                 }
             }
         }
     }
     else
     {
+        CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1,
+                       "LHS tile height must match or source is broadcasting in y dimension.");
+        CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1,
+                       "RHS tile height must match or source is broadcasting in y dimension.");
+
+        CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1,
+                       "LHS tile width must match destination or LHS is broadcasting in x dimension.");
+        CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1,
+                       "RHS tile width must match destination or RHS is broadcasting in x dimension.");
+
         const auto  op_info    = cl_get_binary_op(op, data_type);
         const auto  op_is_func = std::get<0>(op_info);
         const auto &op_name    = std::get<1>(op_info);
@@ -746,36 +779,35 @@ void CLKernelWriter::op_load_store(MemoryOperation         op,
 
     ITensor &tensor = get_tensor(tensor_op);
 
+    const auto tile       = to_cl_tile_view(tile_op);
+    const auto x_tile     = to_cl_tile_view(x).full_tile();
+    const auto y_tile     = to_cl_tile_view(y).full_tile();
+    const auto z_tile     = to_cl_tile_view(z).full_tile();
+    const auto batch_tile = to_cl_tile_view(batch).full_tile();
+
     std::unique_ptr<ICLMemoryOpHelper> helper;
     switch (sampler.storage())
     {
         case TensorStorageType::BufferUint8Ptr:
-            helper = std::make_unique<CLMemoryOpBufferHelper>(this, &tensor, &sampler, op);
+            helper = std::make_unique<CLMemoryOpBufferHelper>(this, &tensor, &sampler, op, tile);
             break;
         case TensorStorageType::Texture2dReadOnly:
         case TensorStorageType::Texture2dWriteOnly:
-            helper = std::make_unique<CLMemoryOpImage2dHelper>(this, &tensor, &sampler, op);
+            helper = std::make_unique<CLMemoryOpImage2dHelper>(this, &tensor, &sampler, op, tile);
             break;
         default:
             CKW_THROW_MSG("Unsupported tensor storage");
     }
 
-    // Load/store op doesn't support sub-tile access.
-    const auto tile       = to_cl_tile_view(tile_op).full_tile();
-    const auto x_tile     = to_cl_tile_view(x).full_tile();
-    const auto y_tile     = to_cl_tile_view(y).full_tile();
-    const auto z_tile     = to_cl_tile_view(z).full_tile();
-    const auto batch_tile = to_cl_tile_view(batch).full_tile();
-
     CKW_ASSERT(x_tile.is_scalar());
     CKW_ASSERT(z_tile.is_scalar());
     CKW_ASSERT_IF(indirect_buffer, y_tile.info().width() == 1);
     CKW_ASSERT_IF(!indirect_buffer, y_tile.is_scalar());
     CKW_ASSERT(batch_tile.is_scalar());
 
-    helper->initialize(&tile, &x_tile, &z_tile, &batch_tile);
+    helper->initialize(&x_tile, &z_tile, &batch_tile);
 
-    for (int row = 0; row < tile.info().height(); ++row)
+    for (int row = 0; row < tile.height(); ++row)
     {
         if (!indirect_buffer)
         {
diff --git a/compute_kernel_writer/src/cl/CLTensorArgument.h b/compute_kernel_writer/src/cl/CLTensorArgument.h
index 35df51422e..a79cf340bb 100644
--- a/compute_kernel_writer/src/cl/CLTensorArgument.h
+++ b/compute_kernel_writer/src/cl/CLTensorArgument.h
@@ -27,6 +27,7 @@
 #include "ckw/types/TensorComponentType.h"
 #include "ckw/types/TensorStorageType.h"
 
+#include "src/cl/CLTensorComponent.h"
 #include "src/ITensor.h"
 
 #include <memory>
@@ -39,8 +40,6 @@ namespace ckw
 class TensorInfo;
 
 class ITensorComponent;
-class CLTensorComponent;
-class CLTensorStorage;
 
 /** OpenCL specific tensor argument
  *  Internally, the object keeps track of the components and storages used to minimize the number
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
index a98ebed8fa..7d16f35fbe 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
@@ -34,15 +34,16 @@
 #include "src/cl/CLTile.h"
 #include "src/ITensor.h"
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 namespace ckw
 {
-bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
-                                      const ITensor        *tensor,
-                                      const TensorSampler  *sampler,
-                                      const Tensor3dMapper *mapper,
-                                      MemoryOperation       op,
-                                      const CLTile         *dst)
+bool CLMemoryOpBufferHelper::validate(const CLKernelWriter   *writer,
+                                      const ITensor          *tensor,
+                                      const TensorSampler    *sampler,
+                                      const Tensor3dMapper   *mapper,
+                                      MemoryOperation         op,
+                                      const TileView<CLTile> &dst)
 {
     CKW_UNUSED(writer, tensor, mapper, op, dst);
 
@@ -100,17 +101,14 @@ bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
  *  The outermost block is x, then z and then y. This is why, if/else's covering for y are initialized
  *  at each row write. In some addressing modes, such as None, no if/else conditions are written.
  */
-void CLMemoryOpBufferHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpBufferHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
 {
-    _dst = dst;
-
     CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, _dst));
 
-    _ls_width_full = dst->info().width();
-    _coord_x       = x->scalar(0, 0).str;
-    _coord_z       = z->scalar(0, 0).str;
-    _coord_b       = b->scalar(0, 0).str;
-    _coord_orig_z  = _coord_z;
+    _coord_x      = x->scalar(0, 0).str;
+    _coord_z      = z->scalar(0, 0).str;
+    _coord_b      = b->scalar(0, 0).str;
+    _coord_orig_z = _coord_z;
 
     out_of_bound_initialize_x(_coord_x);
     out_of_bound_initialize_z(_coord_z);
@@ -121,7 +119,7 @@ void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_
     // The only check required is on Y.
     out_of_bound_initialize_y(coord_y);
 
-    const std::string dst     = _dst->vector(row_id).str;
+    const std::string dst     = _dst.vector(row_id).str;
     const std::string address = to_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
     const std::string ls_buf  = to_statement(_op, _ls_width_full, dst, address);
 
@@ -133,10 +131,17 @@ void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_
     // The left over load/store will be written in the finalize stage
     if (_ls_width_part.size() != 0)
     {
-        int32_t col_start = 0;
+        int32_t        col_start     = 0;
+        const TileArea original_area = _dst.area();
+
         for (int32_t partial_width : _ls_width_part)
         {
-            const std::string dst       = _dst->vector(row_id, col_start, partial_width).str;
+            // Set the active area
+            const TileArea area(original_area.row_start(), original_area.row_end(), col_start,
+                                col_start + partial_width);
+            _dst.area(area);
+
+            const std::string dst       = _dst.vector(row_id).str;
             const std::string coord_x   = _coord_x + " + " + std::to_string(col_start);
             const std::string address   = to_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
             const std::string statement = to_statement(_op, partial_width, dst, address);
@@ -144,6 +149,8 @@ void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_
 
             col_start += partial_width;
         }
+        // Restore the original area
+        _dst.area(original_area);
     }
 }
 
@@ -304,7 +311,7 @@ std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x,
     CKW_ASSERT(tensor_storage == TensorStorageType::BufferUint8Ptr);
 
     const std::string ptr_buf  = _tensor->storage(tensor_storage).val;
-    const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst->info().data_type(), 1);
+    const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst.data_type(), 1);
 
     std::string address;
     address += "(__global ";
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
index 4e1a842fe1..a6b3272f32 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
@@ -22,8 +22,8 @@
  * SOFTWARE.
  */
 
-#ifndef CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
-#define CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
+#ifndef CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
+#define CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
 
 #include "src/cl/helpers/ICLMemoryOpHelper.h"
 
@@ -37,6 +37,8 @@ namespace ckw
 // Forward Declarations
 class CLKernelWriter;
 class CLTile;
+template <class CLTile>
+class TileView;
 enum class MemoryOperation;
 
 /** Helper class to write memory operations (like load/store) in OpenCL
@@ -45,19 +47,23 @@ class CLMemoryOpBufferHelper : public ICLMemoryOpHelper
 {
 public:
     /** Constructor similar to @ref ICLMemoryOpHelper() */
-    CLMemoryOpBufferHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : ICLMemoryOpHelper(writer, tensor, sampler, op)
+    CLMemoryOpBufferHelper(CLKernelWriter         *writer,
+                           ITensor                *tensor,
+                           TensorSampler          *sampler,
+                           MemoryOperation         op,
+                           const TileView<CLTile> &dst)
+        : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
     {
     }
 
     /** Copy constructor */
-    CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = default;
+    CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = delete;
 
     /** Assignment operator overload */
-    CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = default;
+    CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = delete;
 
     // Methods overridden
-    void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+    void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
     void write_row(int32_t row_id, const std::string &coord_y) override;
     void finalize() override;
 
@@ -78,12 +84,12 @@ private:
     std::vector<LeftoverDescriptor> _leftovers_x{};
     std::string                     _coord_orig_z{};
 
-    static bool validate(const CLKernelWriter *writer,
-                         const ITensor        *tensor,
-                         const TensorSampler  *sampler,
-                         const Tensor3dMapper *mapper,
-                         MemoryOperation       op,
-                         const CLTile         *dst);
+    static bool validate(const CLKernelWriter   *writer,
+                         const ITensor          *tensor,
+                         const TensorSampler    *sampler,
+                         const Tensor3dMapper   *mapper,
+                         MemoryOperation         op,
+                         const TileView<CLTile> &dst);
 
     void out_of_bound_initialize_x(const std::string &coord);
     void out_of_bound_finalize_x();
@@ -99,4 +105,4 @@ private:
 };
 } // namespace ckw
 
-#endif /* CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
index b7d146bdee..f392cd89cc 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
@@ -33,18 +33,15 @@
 #include "src/cl/CLTile.h"
 #include "src/ITensor.h"
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 namespace ckw
 {
-void CLMemoryOpImage2dHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpImage2dHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
 {
-    CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, dst));
-
-    _dst           = dst;
-    _ls_width_full = dst->info().width();
-    _coord_x       = x->scalar(0, 0).str;
-    _coord_z       = z->scalar(0, 0).str;
-    _coord_b       = b->scalar(0, 0).str;
+    _coord_x = x->scalar(0, 0).str;
+    _coord_z = z->scalar(0, 0).str;
+    _coord_b = b->scalar(0, 0).str;
 }
 
 void CLMemoryOpImage2dHelper::write_row(int32_t row_id, const std::string &coord_y)
@@ -52,7 +49,7 @@ void CLMemoryOpImage2dHelper::write_row(int32_t row_id, const std::string &coord
     // The only check required is on Y.
     out_of_bound_initialize_y(coord_y);
 
-    const std::string dst     = _dst->vector(row_id).str;
+    const std::string dst     = _dst.vector(row_id).str;
     const std::string sampler = to_ls_image2d_sampler();
     const std::string coord   = to_ls_image2d_address(_coord_x, coord_y, _coord_z, _coord_b);
     const std::string ls_buf  = to_ls_image2d(_op, _ls_width_full, dst, sampler, coord);
@@ -66,16 +63,16 @@ void CLMemoryOpImage2dHelper::finalize()
 {
 }
 
-bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
-                                       const ITensor        *tensor,
-                                       const TensorSampler  *sampler,
-                                       const Tensor3dMapper *mapper,
-                                       MemoryOperation       op,
-                                       const CLTile         *dst)
+bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter   *writer,
+                                       const ITensor          *tensor,
+                                       const TensorSampler    *sampler,
+                                       const Tensor3dMapper   *mapper,
+                                       MemoryOperation         op,
+                                       const TileView<CLTile> &dst)
 {
     CKW_UNUSED(writer, tensor, mapper);
 
-    if (dst->info().width() != 4)
+    if (dst.width() != 4)
     {
         return false;
     }
@@ -95,7 +92,7 @@ bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
     {
         return false;
     }
-    if ((dst->info().data_type() != DataType::Fp32) && (dst->info().data_type() != DataType::Fp16))
+    if ((dst.data_type() != DataType::Fp32) && (dst.data_type() != DataType::Fp16))
     {
         return false;
     }
@@ -143,10 +140,12 @@ std::string CLMemoryOpImage2dHelper::to_ls_image2d(MemoryOperation    op,
                                                    const std::string &address) const
 {
     CKW_UNUSED(vector_width);
+    CKW_ASSERT_MSG(_dst.data_type() == DataType::Fp32 || _dst.data_type() == DataType::Fp16,
+                   "Image2d only supports floating-point data type");
 
     const TensorStorageType tensor_storage = _sampler->storage();
     const std::string       image2d_obj    = _tensor->storage(tensor_storage).val;
-    const std::string       post_fix       = _dst->info().data_type() == DataType::Fp32 ? "f" : "h";
+    const std::string       post_fix       = _dst.data_type() == DataType::Fp32 ? "f" : "h";
 
     switch (op)
     {
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
index fd9b097a24..6c42c132d9 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
@@ -35,6 +35,8 @@ namespace ckw
 // Forward Declarations
 class CLKernelWriter;
 class CLTile;
+template <class CLTile>
+class TileView;
 enum class MemoryOperation;
 
 /** Helper class to write memory operations (like load/store) in OpenCL for Image2d type */
@@ -42,29 +44,33 @@ class CLMemoryOpImage2dHelper : public ICLMemoryOpHelper
 {
 public:
     /** Constructor similar to @ref ICLMemoryOpHelper() */
-    CLMemoryOpImage2dHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : ICLMemoryOpHelper(writer, tensor, sampler, op)
+    CLMemoryOpImage2dHelper(CLKernelWriter         *writer,
+                            ITensor                *tensor,
+                            TensorSampler          *sampler,
+                            MemoryOperation         op,
+                            const TileView<CLTile> &dst)
+        : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
     {
     }
 
     /** Copy constructor */
-    CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = default;
+    CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = delete;
 
     /** Assignment operator overload */
-    CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = default;
+    CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = delete;
 
     // Methods overridden
-    void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+    void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
     void write_row(int32_t row_id, const std::string &coord_y) override;
     void finalize() override;
 
 private:
-    static bool validate(const CLKernelWriter *writer,
-                         const ITensor        *tensor,
-                         const TensorSampler  *sampler,
-                         const Tensor3dMapper *mapper,
-                         MemoryOperation       op,
-                         const CLTile         *dst);
+    static bool validate(const CLKernelWriter   *writer,
+                         const ITensor          *tensor,
+                         const TensorSampler    *sampler,
+                         const Tensor3dMapper   *mapper,
+                         MemoryOperation         op,
+                         const TileView<CLTile> &dst);
 
     void out_of_bound_initialize_y(const std::string &coord);
     void out_of_bound_finalize_y();
diff --git a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
index f46fee9750..a5b679ac03 100644
--- a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
@@ -28,6 +28,7 @@
 #include "ckw/TensorSampler.h"
 
 #include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
 
 #include <cstdint>
 #include <memory>
@@ -55,18 +56,24 @@ public:
      * @param[in] tensor  @ref ckw::ITensor object to perform the memory operation on
      * @param[in] sampler @ref ckw::TensorSampler object that tells how to sample a tensor
      * @param[in] op      The memory operation to be done (e.g. Load/Store)
+     * @param[in] dst     The tile to perform the memory operation on
      */
-    ICLMemoryOpHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
-        : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op)
+    ICLMemoryOpHelper(CLKernelWriter         *writer,
+                      ITensor                *tensor,
+                      TensorSampler          *sampler,
+                      MemoryOperation         op,
+                      const TileView<CLTile> &dst)
+        : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op), _dst(dst)
     {
-        _mapper = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+        _mapper        = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+        _ls_width_full = _dst.width();
     }
 
     /** Copy constructor */
-    ICLMemoryOpHelper(const ICLMemoryOpHelper &) = default;
+    ICLMemoryOpHelper(const ICLMemoryOpHelper &) = delete;
 
     /** Assignment operator overload */
-    ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = default;
+    ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = delete;
 
     /** Destructor */
     virtual ~ICLMemoryOpHelper() = default;
@@ -75,12 +82,11 @@ public:
      *  the batch offset as a tile object, and initializes the code inside
      *  the writer object.
      *
-     * @param[in] dst  tile object to perform the memory operation on
      * @param[in] x    tile object that describes the x-coordinate of the tensor involved
      * @param[in] z    tile object that describes the z-coordinate of the tensor involved
      * @param[in] b    tile object that describes the batch offset of the tensor involved
      */
-    virtual void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) = 0;
+    virtual void initialize(const CLTile *x, const CLTile *z, const CLTile *b) = 0;
 
     /** Method that writes the actual code to the writer that performs the mentioned memory
      *  operation on the tile initialized. It writes the code for a specific row given in the
@@ -104,7 +110,7 @@ protected:
     TensorSampler                  *_sampler{nullptr};
     MemoryOperation                 _op;
     std::unique_ptr<Tensor3dMapper> _mapper{nullptr};
-    const CLTile                   *_dst{nullptr};
+    TileView<CLTile>                _dst{};
     int32_t                         _ls_width_full{0};
     std::string                     _coord_x{};
     std::string                     _coord_z{};
@@ -112,4 +118,4 @@ protected:
 };
 } // namespace ckw
 
-#endif /* CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H
diff --git a/compute_kernel_writer/src/types/ConstantData.cpp b/compute_kernel_writer/src/types/ConstantData.cpp
index 67b1103860..6d15eab407 100644
--- a/compute_kernel_writer/src/types/ConstantData.cpp
+++ b/compute_kernel_writer/src/types/ConstantData.cpp
@@ -31,7 +31,7 @@ namespace ckw
 namespace
 {
 template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
+typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
 {
     std::stringstream ss;
     ss << std::scientific << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
@@ -39,14 +39,14 @@ inline typename std::enable_if<std::is_same<T, float>::value, std::string>::type
 }
 
 template <typename T>
-inline typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type
+typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type
 to_str(T value)
 {
     return std::to_string(value);
 }
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
+typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
 {
     return std::to_string((int)value);
 }
@@ -72,6 +72,24 @@ ConstantData::ConstantData(std::initializer_list<std::initializer_list<T>> value
 }
 
 template <typename T>
+ConstantData::ConstantData(const std::vector<std::vector<T>> &values, DataType data_type) : _data_type(data_type)
+{
+    CKW_ASSERT(validate<T>(data_type));
+    CKW_ASSERT(values.size() > 0);
+
+    for (auto value_arr : values)
+    {
+        // Each row must have the same number of elements
+        CKW_ASSERT(value_arr.size() == (*values.begin()).size());
+
+        StringVector vec;
+        std::transform(value_arr.begin(), value_arr.end(), std::back_inserter(vec), [](T val) { return to_str(val); });
+
+        _values.push_back(std::move(vec));
+    }
+}
+
+template <typename T>
 bool ConstantData::validate(DataType data_type)
 {
     switch (data_type)
@@ -100,6 +118,10 @@ template ConstantData::ConstantData(std::initializer_list<std::initializer_list<
 template ConstantData::ConstantData(std::initializer_list<std::initializer_list<uint32_t>>, DataType);
 template ConstantData::ConstantData(std::initializer_list<std::initializer_list<bool>>, DataType);
 template ConstantData::ConstantData(std::initializer_list<std::initializer_list<float>>, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<int32_t>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<uint32_t>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<bool>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<float>> &, DataType);
 
 template bool ConstantData::validate<int32_t>(DataType);
 template bool ConstantData::validate<uint32_t>(DataType);
diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
index bfa6724008..44a4df1ce1 100644
--- a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
+++ b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
@@ -61,25 +61,19 @@ public:
 
         _tests.push_back({ 2, 4, DataType::Bool, 2, 1, 2, 1, DataType::Fp32, BinaryOp::GreaterEqual, "G0__dst__0 = (float4)G0__lhs__0 >= (float4)G0__rhs__0;\nG0__dst__1 = (float4)G0__lhs__1 >= (float4)G0__rhs__1;\n" }); // LHS and RHS x-dimension broadcast.
 
-        _tests.push_back({ 2, 3, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
+        _tests.push_back({ 2, 2, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
                            "G0__dst__0.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s0, G0__dst__0.s0);\n"
-                           "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
-                           "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s2, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
-                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s0, G0__dst__0.s1);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
-                           "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__1.s1);\n"
+                           "G0__dst__0.s0 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__0.s0);\n"
+                           "G0__dst__0.s0 = fma(G0__lhs__0.s2, G0__rhs__0.s2, G0__dst__0.s0);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s0, G0__rhs__1.s0, G0__dst__0.s1);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__1.s1, G0__dst__0.s1);\n"
+                           "G0__dst__0.s1 = fma(G0__lhs__0.s2, G0__rhs__1.s2, G0__dst__0.s1);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__0.s0, G0__dst__1.s0);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s1, G0__rhs__0.s1, G0__dst__1.s0);\n"
+                           "G0__dst__1.s0 = fma(G0__lhs__1.s2, G0__rhs__0.s2, G0__dst__1.s0);\n"
+                           "G0__dst__1.s1 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__1.s1);\n"
                            "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s2, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n"
-                           "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n" });
+                           "G0__dst__1.s1 = fma(G0__lhs__1.s2, G0__rhs__1.s2, G0__dst__1.s1);\n" });
     }
 
     bool run() override
diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
index 3e1056972e..855c747f13 100644
--- a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
+++ b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
@@ -81,7 +81,15 @@ public:
             "{\n"
             "}\n";
 
-        const auto &actual_code = kernel->source_code();
+        std::string actual_code = kernel->source_code();
+
+        std::size_t pos = actual_code.find("__kernel");
+
+        if (pos != std::string::npos)
+        {
+            // Remove text before "__kernel"
+            actual_code = actual_code.substr(pos);
+        }
 
         int test_id = 0;
         VALIDATE_TEST(kernel->arguments().size() == 4, all_tests_passed, test_id++);
author	Gunes Bayir <gunes.bayir@arm.com>	2024-01-17 16:07:03 +0000
committer	Viet-Hoa Do <viet-hoa.do@arm.com>	2024-02-01 16:00:34 +0000
commit	2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c (patch)
tree	a4e2d5ce46443a79a0778e4960462ce3edf106ec /compute_kernel_writer
parent	7ab7fca87cca8775f82b0e9efec6a40975910c17 (diff)
download	ComputeLibrary-2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c.tar.gz