aboutsummaryrefslogtreecommitdiff
path: root/compute_kernel_writer
diff options
context:
space:
mode:
authorSiCong Li <sicong.li@arm.com>2023-07-18 17:56:49 +0100
committerSiCong Li <sicong.li@arm.com>2023-07-28 15:29:15 +0000
commit16b37527906c68885f81a8db35f9d6040d73efec (patch)
tree9669b5ebda00b3e3b1ac55992c144b09324b5997 /compute_kernel_writer
parent9129549110527fd53655d3e6b61e8e59bed6f97f (diff)
downloadComputeLibrary-16b37527906c68885f81a8db35f9d6040d73efec.tar.gz
Port ElementwiseBinary to CKW part 2
* Add fp16 support * Implement broadcasting to elementwise binary * Implement kernel name and kernel config id * Always use explicit cast in ckw unary, binary and ternary elementwise functions. This is to address the accidental use of double literals, with other benefits. * Refactor TypeConverter for smaller includes Resolves COMPMID-6260 Change-Id: I26b726746f8c0dd7b5942ad379d56f4d7642d15f Signed-off-by: SiCong Li <sicong.li@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9999 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'compute_kernel_writer')
-rw-r--r--compute_kernel_writer/prototype/src/Prototype.h67
1 files changed, 43 insertions, 24 deletions
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
index 05c7306e3a..a8dc7fbfdb 100644
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ b/compute_kernel_writer/prototype/src/Prototype.h
@@ -2194,10 +2194,34 @@ struct GpuKernel
std::vector<std::pair<int32_t, TensorComponentType>> list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage)
};
+// Generate all extension pragmas (hardcoded for now)
+inline std::string generate_extensions()
+{
+ std::string ext = R"(
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(cl_khr_fp16)
+
+#if defined(cl_arm_integer_dot_product_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
+#endif // defined(cl_arm_integer_dot_product_int8)
+
+#if defined(cl_arm_integer_dot_product_accumulate_int8)
+#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
+#endif // defined(cl_arm_integer_dot_product_accumulate_int8)
+
+#if defined(cl_arm_printf)
+#pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf);
+)";
+ return ext;
+}
+
// This function should produce an object with the source
inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string &name)
{
std::string code;
+ code += generate_extensions();
code += "__kernel void ";
code += name;
code += "(\n";
@@ -2783,6 +2807,8 @@ private:
case TensorSamplerAddressModeY::SkipMinEdgeOnly:
_writer->compound_statement_end();
break;
+ case TensorSamplerAddressModeY::None:
+ break;
default:
assert(false);
@@ -2799,6 +2825,8 @@ private:
_writer->write_text(" = 0.0f;\n");
_writer->compound_statement_end();
break;
+ case TensorSamplerAddressModeY::None:
+ break;
default:
assert(false);
@@ -2857,6 +2885,8 @@ private:
case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
_writer->compound_statement_end();
break;
+ case TensorSamplerAddressModeZ::None:
+ break;
default:
assert(false);
@@ -3622,14 +3652,15 @@ public:
const IVectorTile *src = operands.unpack(src_name);
const IVectorTile *dst = operands.unpack(dst_name);
- const int32_t dst_w = dst->format().w;
const int32_t dst_h = dst->format().h;
- const int32_t src_w = src->format().w;
const std::string dt = dst->underlying_source_variables()[0].type.str;
- const bool broadcast_src_x = dst_w != 1 && src_w == 1;
-
- const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
+ // Always perform an explicit cast. This automatically covers at least the 2 scenarios:
+ // 1. Widen a scalar into a vector type. This enables scalar-vector broadcasting
+ // 2. Ensure non-ambiguity over function overloads.
+ // E.g. a constant tile may be accidentally initialized with a double literal. By casting it to single float,
+ // it avoids ambiguous function calls
+ const std::string src_prefix = "(" + dt + ")";
// Broadcasting on Y is automatic
for(int32_t y = 0; y < dst_h; ++y)
@@ -3679,18 +3710,13 @@ public:
const IVectorTile *second = operands.unpack(second_name);
const IVectorTile *dst = operands.unpack(dst_name);
- const int32_t dst_w = dst->format().w;
const int32_t dst_h = dst->format().h;
- const int32_t first_w = first->format().w;
- const int32_t second_w = second->format().w;
const auto datatype = dst->underlying_source_variables()[0].type;
const std::string datatype_str = datatype.str;
- const bool broadcast_first_x = dst_w != 1 && first_w == 1;
- const bool broadcast_second_x = dst_w != 1 && second_w == 1;
-
- const std::string first_prefix = broadcast_first_x ? "(" + datatype_str + ")" : "";
- const std::string second_prefix = broadcast_second_x ? "(" + datatype_str + ")" : "";
+ // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
+ const std::string first_prefix = "(" + datatype_str + ")";
+ const std::string second_prefix = "(" + datatype_str + ")";
const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16);
@@ -3727,20 +3753,13 @@ public:
const IVectorTile *third = operands.unpack(third_name);
const IVectorTile *dst = operands.unpack(dst_name);
- const int32_t dst_w = dst->format().w;
const int32_t dst_h = dst->format().h;
- const int32_t first_w = first->format().w;
- const int32_t second_w = second->format().w;
- const int32_t third_w = third->format().w;
const std::string dt = dst->underlying_source_variables()[0].type.str;
- const bool broadcast_first_x = dst_w != 1 && first_w == 1;
- const bool broadcast_second_x = dst_w != 1 && second_w == 1;
- const bool broadcast_third_x = dst_w != 1 && third_w == 1;
-
- const std::string first_prefix = broadcast_first_x ? "(" + dt + ")" : "";
- const std::string second_prefix = broadcast_second_x ? "(" + dt + ")" : "";
- const std::string third_prefix = broadcast_third_x ? "(" + dt + ")" : "";
+ // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
+ const std::string first_prefix = "(" + dt + ")";
+ const std::string second_prefix = "(" + dt + ")";
+ const std::string third_prefix = "(" + dt + ")";
// Broadcasting on Y is automatic
for(int32_t y = 0; y < dst_h; ++y)