aboutsummaryrefslogtreecommitdiff
path: root/compute_kernel_writer
diff options
context:
space:
mode:
authorGunes Bayir <gunes.bayir@arm.com>2024-01-17 16:07:03 +0000
committerViet-Hoa Do <viet-hoa.do@arm.com>2024-02-01 16:00:34 +0000
commit2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c (patch)
treea4e2d5ce46443a79a0778e4960462ce3edf106ec /compute_kernel_writer
parent7ab7fca87cca8775f82b0e9efec6a40975910c17 (diff)
downloadComputeLibrary-2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c.tar.gz
Use the stable CKW API in the GPU dynamic fusion backend
- Refactor all kernels to work with the CKW stable API - Add support for sub-tile in the op_load/op_store CKW operator - Fix mismatch in resize - Add comments in all kernels written with CKW to help developers understand the structure of the code - Add texture image support in depthwise convolution written with CKW - Add support for different block sizes in depthwise convolution - Remove the use of the dynamic fusion helper functions. - Add support for floor in the op_unary() of CKW Resolves: COMPMID-6708, COMPMID-6743, COMPMID-6530 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com> Signed-off-by: Jakub Sujak <jakub.sujak@arm.com> Change-Id: I8104ce4d04a3138a1aeb0b84940e1f1c89e76069 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10914 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'compute_kernel_writer')
-rw-r--r--compute_kernel_writer/include/ckw/KernelWriter.h23
-rw-r--r--compute_kernel_writer/include/ckw/TensorOperand.h11
-rw-r--r--compute_kernel_writer/include/ckw/TileOperand.h13
-rw-r--r--compute_kernel_writer/include/ckw/types/ConstantData.h4
-rw-r--r--compute_kernel_writer/include/ckw/types/Operators.h1
-rw-r--r--compute_kernel_writer/src/KernelWriter.cpp3
-rw-r--r--compute_kernel_writer/src/TensorOperand.cpp58
-rw-r--r--compute_kernel_writer/src/TensorSampler.cpp2
-rw-r--r--compute_kernel_writer/src/TileOperand.cpp14
-rw-r--r--compute_kernel_writer/src/TileView.h20
-rw-r--r--compute_kernel_writer/src/cl/CLHelpers.cpp3
-rw-r--r--compute_kernel_writer/src/cl/CLKernelWriter.cpp88
-rw-r--r--compute_kernel_writer/src/cl/CLTensorArgument.h3
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp43
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h34
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp35
-rw-r--r--compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h28
-rw-r--r--compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h24
-rw-r--r--compute_kernel_writer/src/types/ConstantData.cpp28
-rw-r--r--compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h28
-rw-r--r--compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h10
21 files changed, 323 insertions, 150 deletions
diff --git a/compute_kernel_writer/include/ckw/KernelWriter.h b/compute_kernel_writer/include/ckw/KernelWriter.h
index 0d739e859a..da41b940d7 100644
--- a/compute_kernel_writer/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/include/ckw/KernelWriter.h
@@ -25,11 +25,22 @@
#ifndef CKW_INCLUDE_CKW_KERNELWRITER_H
#define CKW_INCLUDE_CKW_KERNELWRITER_H
+#include "ckw/Kernel.h"
+#include "ckw/TensorInfo.h"
#include "ckw/TensorOperand.h"
+#include "ckw/TensorSampler.h"
+#include "ckw/TileInfo.h"
#include "ckw/TileOperand.h"
#include "ckw/types/ConstantData.h"
#include "ckw/types/ConvertPolicy.h"
+#include "ckw/types/DataType.h"
#include "ckw/types/Operators.h"
+#include "ckw/types/TargetArchitecture.h"
+#include "ckw/types/TargetLanguage.h"
+#include "ckw/types/TensorComponentType.h"
+#include "ckw/types/TensorDataLayout.h"
+#include "ckw/types/TensorSamplerTypes.h"
+#include "ckw/types/TensorStorageType.h"
#include <functional>
#include <memory>
@@ -39,16 +50,8 @@
namespace ckw
{
-/** Forward Declerations */
-class Kernel;
-class TensorInfo;
-class TensorSampler;
+/** Forward Declarations */
class TileArea;
-class TileInfo;
-
-enum class DataType;
-enum class TargetArchitecture;
-enum class TargetLanguage;
/** A kernel writer.
*
@@ -350,7 +353,6 @@ public:
const TileOperand &z,
const TileOperand &batch_op) = 0;
-protected:
// =============================================================================================
// ID space management
// =============================================================================================
@@ -367,6 +369,7 @@ protected:
/** Get the current ID space. */
int32_t id_space() const;
+protected:
/** Set the current ID space.
*
* @param[in] value The ID space to be used.
diff --git a/compute_kernel_writer/include/ckw/TensorOperand.h b/compute_kernel_writer/include/ckw/TensorOperand.h
index 2672cd5334..a3e53d1314 100644
--- a/compute_kernel_writer/include/ckw/TensorOperand.h
+++ b/compute_kernel_writer/include/ckw/TensorOperand.h
@@ -43,6 +43,15 @@ public:
// Only kernel writer class interacts with tensor operand hence we allow it to access this field.
friend class KernelWriter;
+ /** Create an empty tensor operand.
+ *
+ * The new tensor operand doesn't refer to any tensor therefore it is not useable.
+ */
+ TensorOperand();
+
+ /** Check if the tensor operand contains a tensor and therefore useable. */
+ bool is_valid() const;
+
/** Get the tensor info. */
const TensorInfo &info() const;
@@ -92,7 +101,7 @@ private:
/** Initialize a new instance of @ref TensorOperand class for a tensor. */
TensorOperand(ITensor &tensor);
- ITensor &_tensor;
+ ITensor *_tensor;
};
} // namespace ckw
diff --git a/compute_kernel_writer/include/ckw/TileOperand.h b/compute_kernel_writer/include/ckw/TileOperand.h
index 56dc5e7b2b..556d589bc0 100644
--- a/compute_kernel_writer/include/ckw/TileOperand.h
+++ b/compute_kernel_writer/include/ckw/TileOperand.h
@@ -33,6 +33,7 @@ namespace ckw
class KernelWriter;
class TensorOperand;
class ITile;
+class TileInfo;
/** A tile operand refers to a tile object that can be used for kernel writing. */
class TileOperand
@@ -43,6 +44,18 @@ public:
friend class KernelWriter;
friend class TensorOperand;
+ /** Create an empty tile operand.
+ *
+ * The new tile operand doesn't refer to any tile therefore it is not useable.
+ */
+ TileOperand();
+
+ /** Check if the tile operand contains a tile and therefore useable. */
+ bool is_valid() const;
+
+ /** Get the tile info. */
+ const TileInfo &tile_info() const;
+
/** Get a row vector of the current tile operand.
*
* @param[in] row The index of the row to be accessed in the current tile operand.
diff --git a/compute_kernel_writer/include/ckw/types/ConstantData.h b/compute_kernel_writer/include/ckw/types/ConstantData.h
index 7708818ca8..ea95049c9e 100644
--- a/compute_kernel_writer/include/ckw/types/ConstantData.h
+++ b/compute_kernel_writer/include/ckw/types/ConstantData.h
@@ -53,6 +53,10 @@ public:
template <typename T>
ConstantData(std::initializer_list<std::initializer_list<T>> values, DataType data_type);
+ /** Templated constructor */
+ template <typename T>
+ ConstantData(const std::vector<std::vector<T>> &values, DataType data_type);
+
private:
/** Validate the given data type and the template type
*
diff --git a/compute_kernel_writer/include/ckw/types/Operators.h b/compute_kernel_writer/include/ckw/types/Operators.h
index 1e5f9bd542..77b0519422 100644
--- a/compute_kernel_writer/include/ckw/types/Operators.h
+++ b/compute_kernel_writer/include/ckw/types/Operators.h
@@ -43,6 +43,7 @@ enum class UnaryOp : int32_t
Fabs = 0x0014,
Log = 0x0015,
Round = 0x0016,
+ Floor = 0x0017,
};
/** Assignment operators. */
diff --git a/compute_kernel_writer/src/KernelWriter.cpp b/compute_kernel_writer/src/KernelWriter.cpp
index a478231c09..92a36746ce 100644
--- a/compute_kernel_writer/src/KernelWriter.cpp
+++ b/compute_kernel_writer/src/KernelWriter.cpp
@@ -107,7 +107,8 @@ TensorOperand KernelWriter::create_tensor_operand(ITensor &tensor)
ITensor &KernelWriter::get_tensor(const TensorOperand &operand)
{
- return operand._tensor;
+ CKW_ASSERT(operand._tensor != nullptr);
+ return *operand._tensor;
}
const std::vector<std::vector<std::string>> &KernelWriter::get_values(const ConstantData &data)
diff --git a/compute_kernel_writer/src/TensorOperand.cpp b/compute_kernel_writer/src/TensorOperand.cpp
index bf11d0d332..94997537d8 100644
--- a/compute_kernel_writer/src/TensorOperand.cpp
+++ b/compute_kernel_writer/src/TensorOperand.cpp
@@ -21,91 +21,115 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "ckw/TensorOperand.h"
+#include "ckw/Error.h"
+
#include "src/ITensor.h"
namespace ckw
{
-TensorOperand::TensorOperand(ITensor &tensor) : _tensor(tensor)
+TensorOperand::TensorOperand() : _tensor(nullptr)
+{
+}
+
+TensorOperand::TensorOperand(ITensor &tensor) : _tensor(&tensor)
+{
+}
+
+bool TensorOperand::is_valid() const
{
+ return _tensor != nullptr;
}
const TensorInfo &TensorOperand::info() const
{
- return _tensor.info();
+ CKW_ASSERT(is_valid() == true);
+ return _tensor->info();
}
TileOperand TensorOperand::stride0()
{
- return TileOperand(_tensor.component(TensorComponentType::Stride0));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Stride0));
}
TileOperand TensorOperand::stride1()
{
- return TileOperand(_tensor.component(TensorComponentType::Stride1));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Stride1));
}
TileOperand TensorOperand::stride2()
{
- return TileOperand(_tensor.component(TensorComponentType::Stride2));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Stride2));
}
TileOperand TensorOperand::stride3()
{
- return TileOperand(_tensor.component(TensorComponentType::Stride3));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Stride3));
}
TileOperand TensorOperand::stride4()
{
- return TileOperand(_tensor.component(TensorComponentType::Stride4));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Stride4));
}
TileOperand TensorOperand::dim0()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim0));
+ return TileOperand(_tensor->component(TensorComponentType::Dim0));
}
TileOperand TensorOperand::dim1()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim1));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Dim1));
}
TileOperand TensorOperand::dim2()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim2));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Dim2));
}
TileOperand TensorOperand::dim3()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim3));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Dim3));
}
TileOperand TensorOperand::dim4()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim4));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Dim4));
}
TileOperand TensorOperand::dim1_dim2()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim1xDim2));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Dim1xDim2));
}
TileOperand TensorOperand::dim1_dim2_dim3()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim1xDim2xDim3));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Dim1xDim2xDim3));
}
TileOperand TensorOperand::dim2_dim3()
{
- return TileOperand(_tensor.component(TensorComponentType::Dim2xDim3));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::Dim2xDim3));
}
TileOperand TensorOperand::offset_first_element_in_bytes()
{
- return TileOperand(_tensor.component(TensorComponentType::OffsetFirstElement));
+ CKW_ASSERT(is_valid() == true);
+ return TileOperand(_tensor->component(TensorComponentType::OffsetFirstElement));
}
} // namespace ckw
diff --git a/compute_kernel_writer/src/TensorSampler.cpp b/compute_kernel_writer/src/TensorSampler.cpp
index 91d5af2fd0..e81c5f9d66 100644
--- a/compute_kernel_writer/src/TensorSampler.cpp
+++ b/compute_kernel_writer/src/TensorSampler.cpp
@@ -27,6 +27,8 @@
namespace ckw
{
+TensorSampler::TensorSampler() = default;
+
TensorSampler::TensorSampler(TensorStorageType storage,
TensorSamplerFormat format,
TensorSamplerAddressModeX address_mode_x,
diff --git a/compute_kernel_writer/src/TileOperand.cpp b/compute_kernel_writer/src/TileOperand.cpp
index 865ef85a13..8ced6cfe3f 100644
--- a/compute_kernel_writer/src/TileOperand.cpp
+++ b/compute_kernel_writer/src/TileOperand.cpp
@@ -31,6 +31,10 @@
namespace ckw
{
+TileOperand::TileOperand() : _tile(nullptr), _row_start(0), _row_end(0), _col_start(0), _col_end(0)
+{
+}
+
TileOperand::TileOperand(ITile &tile)
: _tile(&tile), _row_start(0), _row_end(tile.info().height()), _col_start(0), _col_end(tile.info().width())
{
@@ -46,6 +50,16 @@ TileOperand::TileOperand(
CKW_ASSERT(col_end > col_start && col_end <= _tile->info().width());
}
+bool TileOperand::is_valid() const
+{
+ return _tile != nullptr;
+}
+
+const TileInfo &TileOperand::tile_info() const
+{
+ return _tile->info();
+}
+
TileOperand TileOperand::tile(int32_t row_start, int32_t row_end, int32_t col_start, int32_t col_end) const
{
CKW_ASSERT(row_start >= 0 && _row_start + row_start < _row_end);
diff --git a/compute_kernel_writer/src/TileView.h b/compute_kernel_writer/src/TileView.h
index 50ae66b389..42854ac823 100644
--- a/compute_kernel_writer/src/TileView.h
+++ b/compute_kernel_writer/src/TileView.h
@@ -78,6 +78,10 @@ template <typename T>
class TileView
{
public:
+ /** Default constructor */
+ TileView() : _tile(nullptr), _area(0, 0, 0, 0)
+ {
+ }
/** Create a tile view that refers to the whole tile.
*
* @param[in] tile The tile object.
@@ -179,6 +183,22 @@ public:
col_end() == _tile->info().width();
}
+ /** Set the rectangular active area.
+ *
+ * @param[in] area The rectangular active area.
+ */
+ TileView &area(const TileArea &area)
+ {
+ _area = area;
+ return *this;
+ }
+
+ /** Get the tile area */
+ TileArea area() const
+ {
+ return _area;
+ }
+
private:
const T *_tile;
TileArea _area;
diff --git a/compute_kernel_writer/src/cl/CLHelpers.cpp b/compute_kernel_writer/src/cl/CLHelpers.cpp
index 8e4a932764..252c5cdfcb 100644
--- a/compute_kernel_writer/src/cl/CLHelpers.cpp
+++ b/compute_kernel_writer/src/cl/CLHelpers.cpp
@@ -193,6 +193,9 @@ std::tuple<bool, std::string> cl_get_unary_op(UnaryOp op)
case UnaryOp::Round:
return {true, "round"};
+ case UnaryOp::Floor:
+ return {true, "floor"};
+
default:
CKW_THROW_MSG("Unsupported unary operation!");
}
diff --git a/compute_kernel_writer/src/cl/CLKernelWriter.cpp b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
index 62e6853a7a..8b4876b6a7 100644
--- a/compute_kernel_writer/src/cl/CLKernelWriter.cpp
+++ b/compute_kernel_writer/src/cl/CLKernelWriter.cpp
@@ -47,6 +47,25 @@
#include <tuple>
#include <vector>
+namespace
+{
+std::string generate_cl_extensions()
+{
+ std::string ext = R"(
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(cl_khr_fp16)
+
+#if defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf);
+
+#define inf (INFINITY)
+)";
+ return ext;
+}
+} // namespace
+
namespace ckw
{
@@ -56,7 +75,7 @@ CLKernelWriter::~CLKernelWriter() = default;
std::unique_ptr<Kernel> CLKernelWriter::emit_kernel(const std::string &name)
{
std::string code;
-
+ code += generate_cl_extensions();
code += "__kernel void ";
code += name;
code += "\n(\n";
@@ -154,21 +173,31 @@ void CLKernelWriter::op_cast(const TileOperand &dst, const TileOperand &src, Con
const auto dst_type_str = cl_get_variable_datatype_as_string(dst_type, dst_w);
const std::string sat = policy == ConvertPolicy::Saturate ? "_sat" : "";
+
CKW_ASSERT_IF(policy == ConvertPolicy::Saturate, !is_data_type_float(dst_type));
const auto broadcast_x = dst_w != 1 && src_w == 1;
const std::string prefix = broadcast_x ? "(" + dst_type_str + ")" : "";
- CKW_ASSERT_MSG(src_view.data_type() != dst_view.data_type(), "Source and destination type must be different.");
CKW_ASSERT_MSG(src_view.height() == dst_h || src_view.height() == 1,
"Tile height must match or source is broadcasting in y dimension.");
CKW_ASSERT_MSG(src_w == dst_w || src_w == 1, "Tile width must match or source is broadcasting in x dimension.");
// Broadcasting on y dimension is automatic (see CLTile::vector).
- for (int32_t y = 0; y < dst_h; ++y)
+ if (src_view.data_type() == dst_view.data_type())
+ {
+ for (int32_t y = 0; y < dst_h; ++y)
+ {
+ append_code(dst_view.vector(y).str, " = ", src_view.vector(y).str, ";\n");
+ }
+ }
+ else
{
- append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(",
- src_view.vector(y).str, ");\n");
+ for (int32_t y = 0; y < dst_h; ++y)
+ {
+ append_code(dst_view.vector(y).str, " = ", prefix, "convert_", convert_type_str, sat, "(",
+ src_view.vector(y).str, ");\n");
+ }
}
}
@@ -219,18 +248,12 @@ void CLKernelWriter::op_binary(const TileOperand &dst, BinaryOp op, const TileOp
CKW_ASSERT_MSG(lhs_view.data_type() == rhs_view.data_type(), "LHS and RHS type must match.");
- CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1,
- "LHS tile height must match or source is broadcasting in y dimension.");
- CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1,
- "RHS tile height must match or source is broadcasting in y dimension.");
-
- CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1,
- "LHS tile width must match destination or LHS is broadcasting in x dimension.");
- CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1,
- "RHS tile width must match destination or RHS is broadcasting in x dimension.");
-
if (op == BinaryOp::MatMul_Nt_T)
{
+ CKW_ASSERT_MSG(lhs_view.height() == dst_h, "LHS tile height must match the DST tile height");
+ CKW_ASSERT_MSG(rhs_view.height() == dst_w, "RHS tile height must match the DST tile width");
+ CKW_ASSERT_MSG(lhs_view.width() == rhs_view.width(), "LHS tile width must match the LHS tile width");
+
CKW_ASSERT(is_data_type_float(data_type));
for (int32_t y = 0; y < dst_h; ++y)
@@ -239,14 +262,24 @@ void CLKernelWriter::op_binary(const TileOperand &dst, BinaryOp op, const TileOp
{
for (int32_t k = 0; k < lhs_w; ++k)
{
- append_code(dst_view.scalar(x, y).str, " = fma(", lhs_view.scalar(k, y).str, ", ",
- rhs_view.scalar(k, x).str, ", ", dst_view.scalar(x, y).str, ");\n");
+ append_code(dst_view.scalar(y, x).str, " = fma(", lhs_view.scalar(y, k).str, ", ",
+ rhs_view.scalar(x, k).str, ", ", dst_view.scalar(y, x).str, ");\n");
}
}
}
}
else
{
+ CKW_ASSERT_MSG(lhs_view.height() == dst_h || lhs_view.height() == 1,
+ "LHS tile height must match or source is broadcasting in y dimension.");
+ CKW_ASSERT_MSG(rhs_view.height() == dst_h || rhs_view.height() == 1,
+ "RHS tile height must match or source is broadcasting in y dimension.");
+
+ CKW_ASSERT_MSG(lhs_w == dst_w || lhs_w == 1,
+ "LHS tile width must match destination or LHS is broadcasting in x dimension.");
+ CKW_ASSERT_MSG(rhs_w == dst_w || rhs_w == 1,
+ "RHS tile width must match destination or RHS is broadcasting in x dimension.");
+
const auto op_info = cl_get_binary_op(op, data_type);
const auto op_is_func = std::get<0>(op_info);
const auto &op_name = std::get<1>(op_info);
@@ -746,36 +779,35 @@ void CLKernelWriter::op_load_store(MemoryOperation op,
ITensor &tensor = get_tensor(tensor_op);
+ const auto tile = to_cl_tile_view(tile_op);
+ const auto x_tile = to_cl_tile_view(x).full_tile();
+ const auto y_tile = to_cl_tile_view(y).full_tile();
+ const auto z_tile = to_cl_tile_view(z).full_tile();
+ const auto batch_tile = to_cl_tile_view(batch).full_tile();
+
std::unique_ptr<ICLMemoryOpHelper> helper;
switch (sampler.storage())
{
case TensorStorageType::BufferUint8Ptr:
- helper = std::make_unique<CLMemoryOpBufferHelper>(this, &tensor, &sampler, op);
+ helper = std::make_unique<CLMemoryOpBufferHelper>(this, &tensor, &sampler, op, tile);
break;
case TensorStorageType::Texture2dReadOnly:
case TensorStorageType::Texture2dWriteOnly:
- helper = std::make_unique<CLMemoryOpImage2dHelper>(this, &tensor, &sampler, op);
+ helper = std::make_unique<CLMemoryOpImage2dHelper>(this, &tensor, &sampler, op, tile);
break;
default:
CKW_THROW_MSG("Unsupported tensor storage");
}
- // Load/store op doesn't support sub-tile access.
- const auto tile = to_cl_tile_view(tile_op).full_tile();
- const auto x_tile = to_cl_tile_view(x).full_tile();
- const auto y_tile = to_cl_tile_view(y).full_tile();
- const auto z_tile = to_cl_tile_view(z).full_tile();
- const auto batch_tile = to_cl_tile_view(batch).full_tile();
-
CKW_ASSERT(x_tile.is_scalar());
CKW_ASSERT(z_tile.is_scalar());
CKW_ASSERT_IF(indirect_buffer, y_tile.info().width() == 1);
CKW_ASSERT_IF(!indirect_buffer, y_tile.is_scalar());
CKW_ASSERT(batch_tile.is_scalar());
- helper->initialize(&tile, &x_tile, &z_tile, &batch_tile);
+ helper->initialize(&x_tile, &z_tile, &batch_tile);
- for (int row = 0; row < tile.info().height(); ++row)
+ for (int row = 0; row < tile.height(); ++row)
{
if (!indirect_buffer)
{
diff --git a/compute_kernel_writer/src/cl/CLTensorArgument.h b/compute_kernel_writer/src/cl/CLTensorArgument.h
index 35df51422e..a79cf340bb 100644
--- a/compute_kernel_writer/src/cl/CLTensorArgument.h
+++ b/compute_kernel_writer/src/cl/CLTensorArgument.h
@@ -27,6 +27,7 @@
#include "ckw/types/TensorComponentType.h"
#include "ckw/types/TensorStorageType.h"
+#include "src/cl/CLTensorComponent.h"
#include "src/ITensor.h"
#include <memory>
@@ -39,8 +40,6 @@ namespace ckw
class TensorInfo;
class ITensorComponent;
-class CLTensorComponent;
-class CLTensorStorage;
/** OpenCL specific tensor argument
* Internally, the object keeps track of the components and storages used to minimize the number
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
index a98ebed8fa..7d16f35fbe 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.cpp
@@ -34,15 +34,16 @@
#include "src/cl/CLTile.h"
#include "src/ITensor.h"
#include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
namespace ckw
{
-bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
- const ITensor *tensor,
- const TensorSampler *sampler,
- const Tensor3dMapper *mapper,
- MemoryOperation op,
- const CLTile *dst)
+bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const TileView<CLTile> &dst)
{
CKW_UNUSED(writer, tensor, mapper, op, dst);
@@ -100,17 +101,14 @@ bool CLMemoryOpBufferHelper::validate(const CLKernelWriter *writer,
* The outermost block is x, then z and then y. This is why, if/else's covering for y are initialized
* at each row write. In some addressing modes, such as None, no if/else conditions are written.
*/
-void CLMemoryOpBufferHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpBufferHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
{
- _dst = dst;
-
CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, _dst));
- _ls_width_full = dst->info().width();
- _coord_x = x->scalar(0, 0).str;
- _coord_z = z->scalar(0, 0).str;
- _coord_b = b->scalar(0, 0).str;
- _coord_orig_z = _coord_z;
+ _coord_x = x->scalar(0, 0).str;
+ _coord_z = z->scalar(0, 0).str;
+ _coord_b = b->scalar(0, 0).str;
+ _coord_orig_z = _coord_z;
out_of_bound_initialize_x(_coord_x);
out_of_bound_initialize_z(_coord_z);
@@ -121,7 +119,7 @@ void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_
// The only check required is on Y.
out_of_bound_initialize_y(coord_y);
- const std::string dst = _dst->vector(row_id).str;
+ const std::string dst = _dst.vector(row_id).str;
const std::string address = to_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
const std::string ls_buf = to_statement(_op, _ls_width_full, dst, address);
@@ -133,10 +131,17 @@ void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_
// The left over load/store will be written in the finalize stage
if (_ls_width_part.size() != 0)
{
- int32_t col_start = 0;
+ int32_t col_start = 0;
+ const TileArea original_area = _dst.area();
+
for (int32_t partial_width : _ls_width_part)
{
- const std::string dst = _dst->vector(row_id, col_start, partial_width).str;
+ // Set the active area
+ const TileArea area(original_area.row_start(), original_area.row_end(), col_start,
+ col_start + partial_width);
+ _dst.area(area);
+
+ const std::string dst = _dst.vector(row_id).str;
const std::string coord_x = _coord_x + " + " + std::to_string(col_start);
const std::string address = to_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
const std::string statement = to_statement(_op, partial_width, dst, address);
@@ -144,6 +149,8 @@ void CLMemoryOpBufferHelper::write_row(int32_t row_id, const std::string &coord_
col_start += partial_width;
}
+ // Restore the original area
+ _dst.area(original_area);
}
}
@@ -304,7 +311,7 @@ std::string CLMemoryOpBufferHelper::to_buffer_address(const std::string &x,
CKW_ASSERT(tensor_storage == TensorStorageType::BufferUint8Ptr);
const std::string ptr_buf = _tensor->storage(tensor_storage).val;
- const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst->info().data_type(), 1);
+ const std::string dst_type = cl_data_type_rounded_up_to_valid_vector_width(_dst.data_type(), 1);
std::string address;
address += "(__global ";
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
index 4e1a842fe1..a6b3272f32 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpBufferHelper.h
@@ -22,8 +22,8 @@
* SOFTWARE.
*/
-#ifndef CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
-#define CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H
+#ifndef CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
+#define CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
#include "src/cl/helpers/ICLMemoryOpHelper.h"
@@ -37,6 +37,8 @@ namespace ckw
// Forward Declarations
class CLKernelWriter;
class CLTile;
+template <class CLTile>
+class TileView;
enum class MemoryOperation;
/** Helper class to write memory operations (like load/store) in OpenCL
@@ -45,19 +47,23 @@ class CLMemoryOpBufferHelper : public ICLMemoryOpHelper
{
public:
/** Constructor similar to @ref ICLMemoryOpHelper() */
- CLMemoryOpBufferHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
- : ICLMemoryOpHelper(writer, tensor, sampler, op)
+ CLMemoryOpBufferHelper(CLKernelWriter *writer,
+ ITensor *tensor,
+ TensorSampler *sampler,
+ MemoryOperation op,
+ const TileView<CLTile> &dst)
+ : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
{
}
/** Copy constructor */
- CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = default;
+ CLMemoryOpBufferHelper(const CLMemoryOpBufferHelper &) = delete;
/** Assignment operator overload */
- CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = default;
+ CLMemoryOpBufferHelper &operator=(const CLMemoryOpBufferHelper &) = delete;
// Methods overridden
- void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+ void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
void write_row(int32_t row_id, const std::string &coord_y) override;
void finalize() override;
@@ -78,12 +84,12 @@ private:
std::vector<LeftoverDescriptor> _leftovers_x{};
std::string _coord_orig_z{};
- static bool validate(const CLKernelWriter *writer,
- const ITensor *tensor,
- const TensorSampler *sampler,
- const Tensor3dMapper *mapper,
- MemoryOperation op,
- const CLTile *dst);
+ static bool validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const TileView<CLTile> &dst);
void out_of_bound_initialize_x(const std::string &coord);
void out_of_bound_finalize_x();
@@ -99,4 +105,4 @@ private:
};
} // namespace ckw
-#endif /* CKW_SRC_CL_CLMEMORYOPBUFFERHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_CLMEMORYOPBUFFERHELPER_H
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
index b7d146bdee..f392cd89cc 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.cpp
@@ -33,18 +33,15 @@
#include "src/cl/CLTile.h"
#include "src/ITensor.h"
#include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
namespace ckw
{
-void CLMemoryOpImage2dHelper::initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b)
+void CLMemoryOpImage2dHelper::initialize(const CLTile *x, const CLTile *z, const CLTile *b)
{
- CKW_ASSERT(validate(_writer, _tensor, _sampler, _mapper.get(), _op, dst));
-
- _dst = dst;
- _ls_width_full = dst->info().width();
- _coord_x = x->scalar(0, 0).str;
- _coord_z = z->scalar(0, 0).str;
- _coord_b = b->scalar(0, 0).str;
+ _coord_x = x->scalar(0, 0).str;
+ _coord_z = z->scalar(0, 0).str;
+ _coord_b = b->scalar(0, 0).str;
}
void CLMemoryOpImage2dHelper::write_row(int32_t row_id, const std::string &coord_y)
@@ -52,7 +49,7 @@ void CLMemoryOpImage2dHelper::write_row(int32_t row_id, const std::string &coord
// The only check required is on Y.
out_of_bound_initialize_y(coord_y);
- const std::string dst = _dst->vector(row_id).str;
+ const std::string dst = _dst.vector(row_id).str;
const std::string sampler = to_ls_image2d_sampler();
const std::string coord = to_ls_image2d_address(_coord_x, coord_y, _coord_z, _coord_b);
const std::string ls_buf = to_ls_image2d(_op, _ls_width_full, dst, sampler, coord);
@@ -66,16 +63,16 @@ void CLMemoryOpImage2dHelper::finalize()
{
}
-bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
- const ITensor *tensor,
- const TensorSampler *sampler,
- const Tensor3dMapper *mapper,
- MemoryOperation op,
- const CLTile *dst)
+bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const TileView<CLTile> &dst)
{
CKW_UNUSED(writer, tensor, mapper);
- if (dst->info().width() != 4)
+ if (dst.width() != 4)
{
return false;
}
@@ -95,7 +92,7 @@ bool CLMemoryOpImage2dHelper::validate(const CLKernelWriter *writer,
{
return false;
}
- if ((dst->info().data_type() != DataType::Fp32) && (dst->info().data_type() != DataType::Fp16))
+ if ((dst.data_type() != DataType::Fp32) && (dst.data_type() != DataType::Fp16))
{
return false;
}
@@ -143,10 +140,12 @@ std::string CLMemoryOpImage2dHelper::to_ls_image2d(MemoryOperation op,
const std::string &address) const
{
CKW_UNUSED(vector_width);
+ CKW_ASSERT_MSG(_dst.data_type() == DataType::Fp32 || _dst.data_type() == DataType::Fp16,
+ "Image2d only supports floating-point data type");
const TensorStorageType tensor_storage = _sampler->storage();
const std::string image2d_obj = _tensor->storage(tensor_storage).val;
- const std::string post_fix = _dst->info().data_type() == DataType::Fp32 ? "f" : "h";
+ const std::string post_fix = _dst.data_type() == DataType::Fp32 ? "f" : "h";
switch (op)
{
diff --git a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
index fd9b097a24..6c42c132d9 100644
--- a/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/CLMemoryOpImage2dHelper.h
@@ -35,6 +35,8 @@ namespace ckw
// Forward Declarations
class CLKernelWriter;
class CLTile;
+template <class CLTile>
+class TileView;
enum class MemoryOperation;
/** Helper class to write memory operations (like load/store) in OpenCL for Image2d type */
@@ -42,29 +44,33 @@ class CLMemoryOpImage2dHelper : public ICLMemoryOpHelper
{
public:
/** Constructor similar to @ref ICLMemoryOpHelper() */
- CLMemoryOpImage2dHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
- : ICLMemoryOpHelper(writer, tensor, sampler, op)
+ CLMemoryOpImage2dHelper(CLKernelWriter *writer,
+ ITensor *tensor,
+ TensorSampler *sampler,
+ MemoryOperation op,
+ const TileView<CLTile> &dst)
+ : ICLMemoryOpHelper(writer, tensor, sampler, op, dst)
{
}
/** Copy constructor */
- CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = default;
+ CLMemoryOpImage2dHelper(const CLMemoryOpImage2dHelper &) = delete;
/** Assignment operator overload */
- CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = default;
+ CLMemoryOpImage2dHelper &operator=(const CLMemoryOpImage2dHelper &) = delete;
// Methods overridden
- void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) override;
+ void initialize(const CLTile *x, const CLTile *z, const CLTile *b) override;
void write_row(int32_t row_id, const std::string &coord_y) override;
void finalize() override;
private:
- static bool validate(const CLKernelWriter *writer,
- const ITensor *tensor,
- const TensorSampler *sampler,
- const Tensor3dMapper *mapper,
- MemoryOperation op,
- const CLTile *dst);
+ static bool validate(const CLKernelWriter *writer,
+ const ITensor *tensor,
+ const TensorSampler *sampler,
+ const Tensor3dMapper *mapper,
+ MemoryOperation op,
+ const TileView<CLTile> &dst);
void out_of_bound_initialize_y(const std::string &coord);
void out_of_bound_finalize_y();
diff --git a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
index f46fee9750..a5b679ac03 100644
--- a/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
+++ b/compute_kernel_writer/src/cl/helpers/ICLMemoryOpHelper.h
@@ -28,6 +28,7 @@
#include "ckw/TensorSampler.h"
#include "src/Tensor3dMapper.h"
+#include "src/TileView.h"
#include <cstdint>
#include <memory>
@@ -55,18 +56,24 @@ public:
* @param[in] tensor @ref ckw::ITensor object to perform the memory operation on
* @param[in] sampler @ref ckw::TensorSampler object that tells how to sample a tensor
* @param[in] op The memory operation to be done (e.g. Load/Store)
+ * @param[in] dst The tile to perform the memory operation on
*/
- ICLMemoryOpHelper(CLKernelWriter *writer, ITensor *tensor, TensorSampler *sampler, MemoryOperation op)
- : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op)
+ ICLMemoryOpHelper(CLKernelWriter *writer,
+ ITensor *tensor,
+ TensorSampler *sampler,
+ MemoryOperation op,
+ const TileView<CLTile> &dst)
+ : _writer(writer), _tensor(tensor), _sampler(sampler), _op(op), _dst(dst)
{
- _mapper = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+ _mapper = std::make_unique<Tensor3dMapper>(tensor, sampler->format());
+ _ls_width_full = _dst.width();
}
/** Copy constructor */
- ICLMemoryOpHelper(const ICLMemoryOpHelper &) = default;
+ ICLMemoryOpHelper(const ICLMemoryOpHelper &) = delete;
/** Assignment operator overload */
- ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = default;
+ ICLMemoryOpHelper &operator=(const ICLMemoryOpHelper &) = delete;
/** Destructor */
virtual ~ICLMemoryOpHelper() = default;
@@ -75,12 +82,11 @@ public:
* the batch offset as a tile object, and initializes the code inside
* the writer object.
*
- * @param[in] dst tile object to perform the memory operation on
* @param[in] x tile object that describes the x-coordinate of the tensor involved
* @param[in] z tile object that describes the z-coordinate of the tensor involved
* @param[in] b tile object that describes the batch offset of the tensor involved
*/
- virtual void initialize(const CLTile *dst, const CLTile *x, const CLTile *z, const CLTile *b) = 0;
+ virtual void initialize(const CLTile *x, const CLTile *z, const CLTile *b) = 0;
/** Method that writes the actual code to the writer that performs the mentioned memory
* operation on the tile initialized. It writes the code for a specific row given in the
@@ -104,7 +110,7 @@ protected:
TensorSampler *_sampler{nullptr};
MemoryOperation _op;
std::unique_ptr<Tensor3dMapper> _mapper{nullptr};
- const CLTile *_dst{nullptr};
+ TileView<CLTile> _dst{};
int32_t _ls_width_full{0};
std::string _coord_x{};
std::string _coord_z{};
@@ -112,4 +118,4 @@ protected:
};
} // namespace ckw
-#endif /* CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H */
+#endif // CKW_SRC_CL_HELPERS_ICLMEMORYOPHELPER_H
diff --git a/compute_kernel_writer/src/types/ConstantData.cpp b/compute_kernel_writer/src/types/ConstantData.cpp
index 67b1103860..6d15eab407 100644
--- a/compute_kernel_writer/src/types/ConstantData.cpp
+++ b/compute_kernel_writer/src/types/ConstantData.cpp
@@ -31,7 +31,7 @@ namespace ckw
namespace
{
template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
+typename std::enable_if<std::is_same<T, float>::value, std::string>::type to_str(T value)
{
std::stringstream ss;
ss << std::scientific << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
@@ -39,14 +39,14 @@ inline typename std::enable_if<std::is_same<T, float>::value, std::string>::type
}
template <typename T>
-inline typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type
+typename std::enable_if<!std::is_same<T, float>::value && !std::is_same<T, bool>::value, std::string>::type
to_str(T value)
{
return std::to_string(value);
}
template <typename T>
-inline typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
+typename std::enable_if<std::is_same<T, bool>::value, std::string>::type to_str(T value)
{
return std::to_string((int)value);
}
@@ -72,6 +72,24 @@ ConstantData::ConstantData(std::initializer_list<std::initializer_list<T>> value
}
template <typename T>
+ConstantData::ConstantData(const std::vector<std::vector<T>> &values, DataType data_type) : _data_type(data_type)
+{
+ CKW_ASSERT(validate<T>(data_type));
+ CKW_ASSERT(values.size() > 0);
+
+ for (auto value_arr : values)
+ {
+ // Each row must have the same number of elements
+ CKW_ASSERT(value_arr.size() == (*values.begin()).size());
+
+ StringVector vec;
+ std::transform(value_arr.begin(), value_arr.end(), std::back_inserter(vec), [](T val) { return to_str(val); });
+
+ _values.push_back(std::move(vec));
+ }
+}
+
+template <typename T>
bool ConstantData::validate(DataType data_type)
{
switch (data_type)
@@ -100,6 +118,10 @@ template ConstantData::ConstantData(std::initializer_list<std::initializer_list<
template ConstantData::ConstantData(std::initializer_list<std::initializer_list<uint32_t>>, DataType);
template ConstantData::ConstantData(std::initializer_list<std::initializer_list<bool>>, DataType);
template ConstantData::ConstantData(std::initializer_list<std::initializer_list<float>>, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<int32_t>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<uint32_t>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<bool>> &, DataType);
+template ConstantData::ConstantData(const std::vector<std::vector<float>> &, DataType);
template bool ConstantData::validate<int32_t>(DataType);
template bool ConstantData::validate<uint32_t>(DataType);
diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
index bfa6724008..44a4df1ce1 100644
--- a/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
+++ b/compute_kernel_writer/validation/tests/CLKernelWriterBinaryOpTest.h
@@ -61,25 +61,19 @@ public:
_tests.push_back({ 2, 4, DataType::Bool, 2, 1, 2, 1, DataType::Fp32, BinaryOp::GreaterEqual, "G0__dst__0 = (float4)G0__lhs__0 >= (float4)G0__rhs__0;\nG0__dst__1 = (float4)G0__lhs__1 >= (float4)G0__rhs__1;\n" }); // LHS and RHS x-dimension broadcast.
- _tests.push_back({ 2, 3, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
+ _tests.push_back({ 2, 2, DataType::Fp32, 2, 3, 2, 3, DataType::Fp32, BinaryOp::MatMul_Nt_T,
"G0__dst__0.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s0, G0__dst__0.s0);\n"
- "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
- "G0__dst__0.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__0.s0);\n"
- "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s1, G0__dst__1.s0);\n"
- "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
- "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s1, G0__dst__1.s0);\n"
- "G0__dst__1.s0 = fma(G0__lhs__0.s0, G0__rhs__0.s2, G0__dst__1.s0);\n"
- "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
- "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__1.s2, G0__dst__1.s0);\n"
- "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s0, G0__dst__0.s1);\n"
- "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
- "G0__dst__0.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s0, G0__dst__0.s1);\n"
- "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__1.s1);\n"
+ "G0__dst__0.s0 = fma(G0__lhs__0.s1, G0__rhs__0.s1, G0__dst__0.s0);\n"
+ "G0__dst__0.s0 = fma(G0__lhs__0.s2, G0__rhs__0.s2, G0__dst__0.s0);\n"
+ "G0__dst__0.s1 = fma(G0__lhs__0.s0, G0__rhs__1.s0, G0__dst__0.s1);\n"
+ "G0__dst__0.s1 = fma(G0__lhs__0.s1, G0__rhs__1.s1, G0__dst__0.s1);\n"
+ "G0__dst__0.s1 = fma(G0__lhs__0.s2, G0__rhs__1.s2, G0__dst__0.s1);\n"
+ "G0__dst__1.s0 = fma(G0__lhs__1.s0, G0__rhs__0.s0, G0__dst__1.s0);\n"
+ "G0__dst__1.s0 = fma(G0__lhs__1.s1, G0__rhs__0.s1, G0__dst__1.s0);\n"
+ "G0__dst__1.s0 = fma(G0__lhs__1.s2, G0__rhs__0.s2, G0__dst__1.s0);\n"
+ "G0__dst__1.s1 = fma(G0__lhs__1.s0, G0__rhs__1.s0, G0__dst__1.s1);\n"
"G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
- "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s1, G0__dst__1.s1);\n"
- "G0__dst__1.s1 = fma(G0__lhs__0.s1, G0__rhs__0.s2, G0__dst__1.s1);\n"
- "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n"
- "G0__dst__1.s1 = fma(G0__lhs__1.s1, G0__rhs__1.s2, G0__dst__1.s1);\n" });
+ "G0__dst__1.s1 = fma(G0__lhs__1.s2, G0__rhs__1.s2, G0__dst__1.s1);\n" });
}
bool run() override
diff --git a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
index 3e1056972e..855c747f13 100644
--- a/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
+++ b/compute_kernel_writer/validation/tests/CLKernelWriterDeclareTensorTest.h
@@ -81,7 +81,15 @@ public:
"{\n"
"}\n";
- const auto &actual_code = kernel->source_code();
+ std::string actual_code = kernel->source_code();
+
+ std::size_t pos = actual_code.find("__kernel");
+
+ if (pos != std::string::npos)
+ {
+ // Remove text before "__kernel"
+ actual_code = actual_code.substr(pos);
+ }
int test_id = 0;
VALIDATE_TEST(kernel->arguments().size() == 4, all_tests_passed, test_id++);