From d5f9a1cf9f0340f3e6bf9ff00156fc2adb1fdca9 Mon Sep 17 00:00:00 2001
From: Gunes Bayir <gunes.bayir@arm.com>
Date: Thu, 17 Aug 2023 11:04:02 +0100
Subject: Implement indirect load for buffer and CLImage

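Add the KernelWriter API function op_load_indirect() for loading a tile
through an indirect buffer: the y operand is a tile of width 1 and the
y coordinate of each loaded row is read from the matching row of that
tile. Also handle the SkipLessThanZero y address mode in the OpenCL
buffer and Image2d memory operation helpers.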

Resolves: COMPMID-6390
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: I45dbf88b25ec5caf2b458657ef20aacac9924745
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10192
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
---
 compute_kernel_writer/src/cl/CLKernelWriter.cpp    | 52 +++++++++++++++++-----
 compute_kernel_writer/src/cl/CLKernelWriter.h      | 26 +++--------
 .../src/cl/helpers/CLMemoryOpBufferHelper.cpp      |  6 +++
 .../src/cl/helpers/CLMemoryOpImage2dHelper.cpp     |  7 +++
 4 files changed, 60 insertions(+), 31 deletions(-)

(limited to 'compute_kernel_writer/src/cl')
diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.cpp b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
index a946b989d7..4074da7912 100644
--- a/compute_kernel_writer/src/cl/CLKernelWriter.cpp
+++ b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
@@ -42,6 +42,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <vector>
 
 namespace ckw
 {
@@ -628,7 +629,7 @@ void CLKernelWriter::op_load(const TileOperand &tile_op, const TensorOperand &te
     const CLTile dilation_x({ { "1" } }, DataType::Int32);
     const CLTile dilation_y({ { "1" } }, DataType::Int32);
 
-    op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y);
+    op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, false /* indirect buffer */);
 }
 
 void CLKernelWriter::op_load_dilated(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
@@ -638,7 +639,7 @@ void CLKernelWriter::op_load_dilated(const TileOperand &tile_op, const TensorOpe
     const auto &dil_x_tile = to_cl_tile(dilation_x);
     const auto &dil_y_tile = to_cl_tile(dilation_y);
 
-    op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_tile, dil_y_tile);
+    op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_tile, dil_y_tile, false /* indirect buffer */);
 }
 
 void CLKernelWriter::op_store(const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
@@ -647,7 +648,7 @@ void CLKernelWriter::op_store(const TensorOperand &tensor_op, const TileOperand
     const CLTile dilation_x({ { "1" } }, DataType::Int32);
     const CLTile dilation_y({ { "1" } }, DataType::Int32);
 
-    op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y);
+    op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, false /* indirect buffer */);
 }
 
 void CLKernelWriter::op_store_dilated(const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
@@ -657,16 +658,32 @@ void CLKernelWriter::op_store_dilated(const TensorOperand &tensor_op, const Tile
     const auto &dil_x_tile = to_cl_tile(dilation_x);
     const auto &dil_y_tile = to_cl_tile(dilation_y);
 
-    op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_tile, dil_y_tile);
+    op_load_store(MemoryOperation::Store, tile_op, tensor_op, sampler, x, y, z, batch, dil_x_tile, dil_y_tile, false /* indirect buffer */);
+}
+
+void CLKernelWriter::op_load_indirect(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
+        const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch)
+{
+    const CLTile dilation_x({ { "1" } }, DataType::Int32);
+    const CLTile dilation_y({ { "1" } }, DataType::Int32);
+
+    op_load_store(MemoryOperation::Load, tile_op, tensor_op, sampler, x, y, z, batch, dilation_x, dilation_y, true /* indirect buffer */);
 }
 
 void CLKernelWriter::op_load_store(MemoryOperation op, const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
-                                   const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
-                                   const CLTile &dilation_x, const CLTile &dilation_y)
+        const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
+        const CLTile &dilation_x, const CLTile &dilation_y, bool indirect_buffer)
 {
     CKW_UNUSED(dilation_x);
+    CKW_ASSERT(dilation_x.is_scalar());
+    CKW_ASSERT(dilation_y.is_scalar());
     CKW_ASSERT(dilation_x.scalar(0, 0).str == "((int)(1))"); // Dilation in x dimension is not implemented yet
 
+    if(indirect_buffer)
+    {
+        CKW_ASSERT(dilation_y.scalar(0,0).str == "((int)(1))" && dilation_x.scalar(0,0).str == "((int)(1))");
+    }
+
     ITensor &tensor = get_tensor(tensor_op);
 
     std::unique_ptr<ICLMemoryOpHelper> helper;
@@ -689,18 +706,31 @@ void CLKernelWriter::op_load_store(MemoryOperation op, const TileOperand &tile_o
     const auto &z_tile     = to_cl_tile(z);
     const auto &batch_tile = to_cl_tile(batch);
 
+    CKW_ASSERT(x_tile.is_scalar());
+    CKW_ASSERT(z_tile.is_scalar());
+    CKW_ASSERT_IF(indirect_buffer, y_tile.info().width() == 1);
+    CKW_ASSERT_IF(!indirect_buffer, y_tile.is_scalar());
+    CKW_ASSERT(batch_tile.is_scalar());
+
     helper->initialize(&tile, &x_tile, &z_tile, &batch_tile);
 
     for(int row = 0; row < tile.info().height(); ++row)
     {
-        std::string coord_y = y_tile.scalar(0, 0).str + " + " + std::to_string(row);
+        if(!indirect_buffer)
+        {
+            std::string coord_y = y_tile.scalar(0, 0).str + " + " + std::to_string(row);
+
+            if(dilation_y.scalar(0, 0).str != "((int)(1))")
+            {
+                coord_y += " * " + dilation_y.scalar(0, 0).str;
+            }
 
-        if(dilation_y.scalar(0, 0).str != "1")
+            helper->write_row(row, coord_y);
+        }
+        else
         {
-            coord_y += " * " + dilation_y.scalar(0, 0).str;
+            helper->write_row(row, y_tile.scalar(row, 0).str);
         }
-
-        helper->write_row(row, coord_y);
     }
 
     helper->finalize();
diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.h b/compute_kernel_writer/src/cl/CLKernelWriter.h
index c494847944..1e2e5dc910 100644
--- a/compute_kernel_writer/src/cl/CLKernelWriter.h
+++ b/compute_kernel_writer/src/cl/CLKernelWriter.h
@@ -131,40 +131,27 @@ public:
     // Memory Operations
     // =============================================================================================
 
-    /** Load the data from the tensor memory to the tile using the sampling information.
-     *
-     * Similar to @ref KernelWriter::op_load()
-     */
     void op_load(
         const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
         const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) override;
 
-    /** Load the data from the tensor memory to the tile in a dilated way using the sampling information.
-     *
-     * Similar to @ref KernelWriter::op_load_dilated()
-     */
     void op_load_dilated(
         const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
         const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
         const TileOperand &dilation_x, const TileOperand &dilation_y) override;
 
-    /** Store the data to the tensor memory from the tile using the sampling information.
-     *
-     * Similar to @ref KernelWriter::op_store()
-     */
     void op_store(
         const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
         const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) override;
 
-    /** Store the data to the tensor memory from the tile in a dilated way using the sampling information.
-     *
-     * Similar to @ref KernelWriter::op_store_dilated()
-     */
     void op_store_dilated(
         const TensorOperand &tensor_op, const TileOperand &tile_op, TensorSampler &sampler,
         const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
         const TileOperand &dilation_x, const TileOperand &dilation_y) override;
 
+    void op_load_indirect(const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
+        const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch) override;
+
 protected:
     /** Return @ref CLTile object from the @ref TileOperand object.
      *
@@ -192,11 +179,10 @@ protected:
 
     // For helper functions
 private:
-    /** Helper function to consolidate all load/store logic in this class */
-    void op_load_store(
-        MemoryOperation op, const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
+    /** Helper method to consolidate all load/store logic in this class */
+    void op_load_store(MemoryOperation op, const TileOperand &tile_op, const TensorOperand &tensor_op, TensorSampler &sampler,
         const TileOperand &x, const TileOperand &y, const TileOperand &z, const TileOperand &batch,
-        const CLTile &dilation_x, const CLTile &dilation_y);
+        const CLTile &dilation_x, const CLTile &dilation_y, bool indirect_buffer);
 
     /** This function is the generic function to write both `if` and `else if` blocks.
      *
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
index e50418711e..f906bcd4b1 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
@@ -198,6 +198,9 @@ void CLMemoryOpBufferHelper::out_of_bound_initialize_y(const std::string &coord)
             max = _mapper->dim_y().str;
             _writer->op_write_raw_code("if(" + coord + " < " + max + ")\n{\n");
             break;
+        case TensorSamplerAddressModeY::SkipLessThanZero:
+            _writer->op_write_raw_code("if(" + coord + " >= 0)\n{\n");
+            break;
         case TensorSamplerAddressModeY::None:
             break;
         default:
@@ -216,6 +219,9 @@ void CLMemoryOpBufferHelper::out_of_bound_finalize_y(const std::string &dst)
             _writer->op_write_raw_code(dst);
             _writer->op_write_raw_code(" = 0.0f;\n}\n");
             break;
+        case TensorSamplerAddressModeY::SkipLessThanZero:
+            _writer->op_write_raw_code("}\n");
+            break;
         case TensorSamplerAddressModeY::None:
             break;
         default:
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
index a5f0c17c16..55f88f4136 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
@@ -104,6 +104,9 @@ void CLMemoryOpImage2dHelper::out_of_bound_initialize_y(const std::string &coord
     const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
     switch(address_mode_y)
     {
+        case TensorSamplerAddressModeY::SkipLessThanZero:
+            _writer->op_write_raw_code("if(" + coord + " >= 0)\n{\n");
+            break;
         case TensorSamplerAddressModeY::ClampToBorderMaxOnly:
         case TensorSamplerAddressModeY::None:
             break;
@@ -117,6 +120,9 @@ void CLMemoryOpImage2dHelper::out_of_bound_finalize_y()
     const TensorSamplerAddressModeY address_mode_y = _sampler->address_mode_y();
     switch(address_mode_y)
     {
+        case TensorSamplerAddressModeY::SkipLessThanZero:
+            _writer->op_write_raw_code("}\n");
+            break;
         case TensorSamplerAddressModeY::ClampToBorderMaxOnly:
         case TensorSamplerAddressModeY::None:
             break;
@@ -153,6 +159,7 @@ std::string CLMemoryOpImage2dHelper::to_ls_image2d_sampler() const
     {
         case TensorSamplerAddressModeY::None:
             return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
+        case TensorSamplerAddressModeY::SkipLessThanZero:
         case TensorSamplerAddressModeY::ClampToBorderMaxOnly:
             return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
         default:
-- 
cgit v1.2.1