-rw-r--r--  Android.bp                                                                  |   1
-rw-r--r--  compute_kernel_writer/CMakeLists.txt                                        |   4
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/KernelWriter.h                  |  29
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/TileOperand.h                   |  25
-rw-r--r--  compute_kernel_writer/prototype/src/KernelWriter.cpp                        |  58
-rw-r--r--  compute_kernel_writer/prototype/src/Prototype.h                             |  18
-rw-r--r--  compute_kernel_writer/prototype/src/TileOperand.cpp                         |  57
-rw-r--r--  docs/user_guide/release_version_and_change_log.dox                          |   7
-rw-r--r--  filelist.json                                                               |   1
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp  | 333
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h    |  86
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp     |  17
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h       |  14
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp                     |   3
14 files changed, 618 insertions, 35 deletions
diff --git a/Android.bp b/Android.bp
index b7936e5671..14290b9e1c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -647,6 +647,7 @@ cc_library_static {
"src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
+ "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp",
diff --git a/compute_kernel_writer/CMakeLists.txt b/compute_kernel_writer/CMakeLists.txt
index 1e82f9c6b3..9a97563025 100644
--- a/compute_kernel_writer/CMakeLists.txt
+++ b/compute_kernel_writer/CMakeLists.txt
@@ -102,7 +102,7 @@ target_compile_options(ckw
${CKW_CXX_FLAGS}
"$<$<CXX_COMPILER_ID:GNU>:${GNU_WARNINGS}>"
"$<$<CONFIG:Debug>:${CKW_ASSERTS_OPTS}>"
- "$<$<BOOL:${CKW_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
+ "$<$<BOOL:${CKW_ENABLE_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
# Set CMAKE_CXX_FLAGS last so user can overwrite options
${CMAKE_CXX_FLAGS}
PRIVATE
@@ -113,7 +113,7 @@ target_compile_options(ckw
target_compile_definitions(ckw PUBLIC
$<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_DEBUG_ENABLED>
$<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
- $<$<BOOL:${CKW_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
+ $<$<BOOL:${CKW_ENABLE_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
$<$<BOOL:${CKW_ENABLE_OPENCL}>:COMPUTE_KERNEL_WRITER_OPENCL_ENABLED>
)
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
index c116e62650..72f85c78aa 100644
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
@@ -129,11 +129,38 @@ public:
/** Load the data from the tensor memory to the tile using the sampling information.
*
+ * @param[out] tile The tile to be loaded.
+ * @param[in] tensor The tensor to be read.
+ * @param[in] sampler The tensor sampling information.
+ * @param[in] dilation_y Dilation in the Y dimension.
+ */
+ void op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y = TileOperand("dil_y", 1));
+
+ /** Load the data from the tensor memory to the tile using the indirect buffer approach, respecting the sampling information.
+ *
* @param[out] tile The tile to be loaded.
* @param[in] tensor The tensor to be read.
* @param[in] sampler The tensor sampling information.
*/
- void op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler);
+ void op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler);
+
+ /** Construct an indirection buffer in @p tile containing the precalculated addresses of elements in the source tensor.
+ *
+ * @param[out] tile The tile to be filled with the precalculated addresses.
+ * @param[in] tensor The tensor to be read.
+ * @param[in] sampler The tensor sampling information.
+ * @param[in] x The X coordinate.
+ * @param[in] y The Y coordinate.
+ * @param[in] x_off Offset in the X dimension.
+ * @param[in] y_off Offset in the Y dimension.
+ */
+ void util_get_indirect_buffer(TileOperand &tile,
+ const TensorOperand &tensor,
+ const TensorTileSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &x_off,
+ const TileOperand &y_off);
/** Store the tile to the tensor using the specified sampling information.
*
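
As a reading aid, here is a hedged sketch of how a caller might combine the new load entry points. The helper name and parameter layout are hypothetical; only the util_get_indirect_buffer/op_load_indirect signatures and the sampler setter usage come from this patch (GpuCkwDirectConv2d further down is the real user).

    #include "ckw/KernelWriter.h"
    #include "ckw/TensorTileSampler.h"
    #include "ckw/TileOperand.h"

    // Hypothetical helper: fill `indirect_y` with one source row index per tile row
    // (or -1 for out-of-bounds rows), then load `dst_tile` through that buffer.
    void load_rows_via_indirection(ckw::KernelWriter        &writer,
                                   ckw::TileOperand         &dst_tile,   // m0 x k0 destination tile
                                   ckw::TileOperand         &indirect_y, // m0 x 1 tile of row indices
                                   const ckw::TensorOperand &src,
                                   ckw::TensorTileSampler   &sampler,
                                   const ckw::TileOperand &x, const ckw::TileOperand &y,
                                   const ckw::TileOperand &x_off, const ckw::TileOperand &y_off)
    {
        // Precompute the per-row addresses from the coordinates and kernel offsets.
        writer.util_get_indirect_buffer(indirect_y, src, sampler, x, y, x_off, y_off);

        // Route the Y coordinate of the sampler through the indirection tile and load.
        sampler.y(indirect_y);
        writer.op_load_indirect(dst_tile, src, sampler);
    }
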
diff --git a/compute_kernel_writer/prototype/include/ckw/TileOperand.h b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
index c071707a45..24ee373a24 100644
--- a/compute_kernel_writer/prototype/include/ckw/TileOperand.h
+++ b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
@@ -37,6 +37,8 @@ namespace ckw
class Kernel;
+using TileContainer = std::vector<std::vector<std::string>>;
+
/** Tile operand which can be either scalar, vector or 2D tile. */
class TileOperand : public OperandBase
{
@@ -69,6 +71,13 @@ public:
*/
TileOperand(const ::std::string &name, float value);
+ /** Initialize a new instance of @ref TileOperand for a compile-time constant 2D tile.
+ *
+ * @param[in] name The name of the tile.
+ * @param[in] value The values of the tile as a 2D container of strings.
+ */
+ TileOperand(const ::std::string &name, const ::std::vector<std::vector<std::string>> &value, DataType dt);
+
/** Prohibit copy of tile operand. */
TileOperand(const TileOperand &) = delete;
@@ -96,13 +105,21 @@ public:
/** Get the scalar value of the tile.
*
* The tile must have the shape of 1, 1 (i.e. scalar).
+ *
+ * @return Scalar value as a string.
+ */
+ std::string scalar_value() const;
+
+ /** Get the values of the tile.
+ *
+ * @return 2D container of values.
*/
- ScalarValue scalar_value() const;
+ const TileContainer &value() const;
private:
- TileInfo _info;
- ScalarValue _value{};
- bool _constant;
+ TileInfo _info;
+ TileContainer _value{};
+ bool _constant;
};
} // namespace ckw
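
For context, a minimal sketch of the new constant-tile API; the helper function and the tile contents are illustrative only, while the constructor and accessors are the ones declared above.

    #include "ckw/TileOperand.h"

    #include <string>

    void build_constant_tiles()
    {
        // A 3 x 1 compile-time constant tile holding the row indices "0", "1", "2",
        // similar to the iterator tile built by GpuCkwDirectConv2d later in this patch.
        ckw::TileContainer rows{ { "0" }, { "1" }, { "2" } };
        ckw::TileOperand   it("it", rows, ckw::DataType::Int32);

        // Constant tiles now expose their raw values as strings...
        const ckw::TileContainer &vals = it.value();

        // ...and scalar constants report their single value directly.
        ckw::TileOperand  one("one", 1);           // int32_t constructor
        const std::string s = one.scalar_value();  // "1"

        (void)vals;
        (void)s;
    }
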
diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp
index 9122e518b4..f29cf12802 100644
--- a/compute_kernel_writer/prototype/src/KernelWriter.cpp
+++ b/compute_kernel_writer/prototype/src/KernelWriter.cpp
@@ -128,6 +128,10 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> ope
name,
prototype::TileInfo(info.data_type(), info.width(), info.height()));
}
+ else
+ {
+ _impl->declare_const_tile(name, operand.value(), operand.data_type());
+ }
return operand;
}
@@ -136,7 +140,7 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> ope
// Load and store
// =================================================================================================
-void KernelWriter::op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler)
+void KernelWriter::op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y)
{
prototype::TensorOperand impl_tensor(
tensor.name(),
@@ -152,9 +156,59 @@ void KernelWriter::op_load(TileOperand &tile, TensorOperand &tensor, const Tenso
auto impl_z = sampler.z().create_impl_operand(_impl.get());
auto impl_b = sampler.b().create_impl_operand(_impl.get());
+ auto impl_dilation_y = dilation_y.create_impl_operand(_impl.get());
+
+ auto impl_dst = tile.create_impl_operand(_impl.get());
+
+ _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b, impl_dilation_y);
+}
+
+void KernelWriter::op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler)
+{
+ prototype::TensorOperand impl_tensor(
+ tensor.name(),
+ prototype::GpuSampler{
+ sampler.format(),
+ prototype::to_gpu_tensor_storage(tensor.storage_type()),
+ sampler.address_mode_x(),
+ sampler.address_mode_y(),
+ sampler.address_mode_z() });
+
+ auto impl_x = sampler.x().create_impl_operand(_impl.get());
+ auto impl_y = sampler.y().create_impl_operand(_impl.get());
+ auto impl_z = sampler.z().create_impl_operand(_impl.get());
+ auto impl_b = sampler.b().create_impl_operand(_impl.get());
+
+ auto impl_dst = tile.create_impl_operand(_impl.get());
+
+ _impl->op_load_indirect(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
+}
+
+void KernelWriter::util_get_indirect_buffer(TileOperand &tile,
+ const TensorOperand &tensor,
+ const TensorTileSampler &sampler,
+ const TileOperand &x,
+ const TileOperand &y,
+ const TileOperand &x_off,
+ const TileOperand &y_off)
+{
+ prototype::TensorOperand impl_tensor(
+ tensor.name(),
+ prototype::GpuSampler{
+ sampler.format(),
+ prototype::to_gpu_tensor_storage(tensor.storage_type()),
+ sampler.address_mode_x(),
+ sampler.address_mode_y(),
+ sampler.address_mode_z() });
+
+ auto impl_x = x.create_impl_operand(_impl.get());
+ auto impl_y = y.create_impl_operand(_impl.get());
+ auto impl_x_off = x_off.create_impl_operand(_impl.get());
+ auto impl_y_off = y_off.create_impl_operand(_impl.get());
+
auto impl_dst = tile.create_impl_operand(_impl.get());
- _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
+ _impl->util_get_indirect_buffer(impl_dst, impl_tensor, impl_x, impl_y, impl_x_off, impl_y_off);
}
void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler)
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
index a8dc7fbfdb..2b519471ac 100644
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ b/compute_kernel_writer/prototype/src/Prototype.h
@@ -3009,7 +3009,7 @@ private:
address += " + (";
address += x + ") * sizeof(" + dst_type + ")";
}
- if(y != "0" && (_mapper.is_one_component_y() != true))
+ if(y != "0")
{
const std::string stride_y = _mapper.tensor_component_stride_y();
address += " + (";
@@ -3249,7 +3249,7 @@ private:
std::string coord_x = "(" + x + ") >> 2";
std::string coord_y = "(";
- if(y != "0" && (_mapper.is_one_component_y() != true))
+ if(y != "0")
{
coord_y += y;
}
@@ -4024,13 +4024,6 @@ public:
_data->code += ", ";
_data->code += x_s->scalar(0, i).str;
_data->code += " >= 0);\n";
- // mi_0 = select(wxh, mi_0, y_s >= 0);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " >= 0);\n";
// mi_0 = select(wxh, mi_0, x_s < width);
_data->code += dst->scalar(0, i).str;
_data->code += " = select(-1, ";
@@ -4039,6 +4032,13 @@ public:
_data->code += x_s->scalar(0, i).str;
_data->code += " < ";
_data->code += width + ");\n";
+ // mi_0 = select(wxh, mi_0, y_s >= 0);
+ _data->code += dst->scalar(0, i).str;
+ _data->code += " = select(-1, ";
+ _data->code += dst->scalar(0, i).str;
+ _data->code += ", ";
+ _data->code += y_s->scalar(0, i).str;
+ _data->code += " >= 0);\n";
// mi_0 = select(wxh, mi_0, y_s < height);
_data->code += dst->scalar(0, i).str;
_data->code += " = select(-1, ";
diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp
index fcb3cb6415..bf6a15b9df 100644
--- a/compute_kernel_writer/prototype/src/TileOperand.cpp
+++ b/compute_kernel_writer/prototype/src/TileOperand.cpp
@@ -30,22 +30,42 @@ namespace ckw
{
TileOperand::TileOperand(const std::string &name, const TileInfo &info)
- : OperandBase(name), _info(info), _value{ 0 }, _constant(false)
+ : OperandBase(name),
+ _info(info),
+ _value{ std::vector<std::string>{ "0" } },
+ _constant(false)
{
}
TileOperand::TileOperand(const std::string &name, DataType data_type)
- : OperandBase(name), _info(TileInfo{ data_type }), _value(0), _constant(false)
+ : OperandBase(name),
+ _info(TileInfo{ data_type }),
+ _value{ std::vector<std::string>{ "0" } },
+ _constant(false)
{
}
TileOperand::TileOperand(const std::string &name, int32_t value)
- : OperandBase(name), _info(TileInfo{ DataType::Int32 }), _value(value), _constant(true)
+ : OperandBase(name),
+ _info(TileInfo{ DataType::Int32 }),
+ _value{ std::vector<std::string>{ std::to_string(value) } },
+ _constant(true)
{
}
TileOperand::TileOperand(const std::string &name, float value)
- : OperandBase(name), _info(TileInfo{ DataType::Fp32 }), _value(value), _constant(true)
+ : OperandBase(name),
+ _info(TileInfo{ DataType::Fp32 }),
+ _value{ std::vector<std::string>{ std::to_string(value) } },
+ _constant(true)
+{
+}
+
+TileOperand::TileOperand(const std::string &name, const TileContainer &vals, DataType dt)
+ : OperandBase(name),
+ _info(TileInfo{ dt, static_cast<int32_t>(vals.size()), static_cast<int32_t>(vals[0].size()) }),
+ _value(vals),
+ _constant(true)
{
}
@@ -55,17 +75,23 @@ prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter
if(_constant)
{
- switch(_info.data_type())
+ if(is_scalar())
{
- case DataType::Int32:
- return prototype::Operand(std::to_string(_value.get<int32_t>()),
- prototype::OperandType::ScalarInt32);
+ switch(_info.data_type())
+ {
+ case DataType::Int32:
+ return prototype::Operand(_value[0][0], prototype::OperandType::ScalarInt32);
- case DataType::Fp32:
- return prototype::Operand(std::to_string(_value.get<float>()), prototype::OperandType::ScalarFp32);
+ case DataType::Fp32:
+ return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp32);
- default:
- CKW_ASSERT(false);
+ default:
+ CKW_ASSERT(false);
+ }
+ }
+ else
+ {
+ return prototype::Operand(name());
}
}
else
@@ -94,11 +120,16 @@ bool TileOperand::is_scalar() const
return _info.width() == 1 && _info.height() == 1;
}
-ScalarValue TileOperand::scalar_value() const
+std::string TileOperand::scalar_value() const
{
CKW_ASSERT(is_scalar());
CKW_ASSERT(is_constant());
+ return _value[0][0];
+}
+
+const TileContainer &TileOperand::value() const
+{
return _value;
}
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 801f1f0b0f..edc0c3b9e7 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -48,6 +48,13 @@ v23.08 Public major release
- @ref opencl::kernels::ClMatMulNativeMMULKernel support for FP32 and FP16, with batch support
- Enable transposed convolution with non-square kernels on CPU and GPU.
- Added support for input data type U64/S64 in CLCast.
+ - Added new Compute Kernel Writer (CKW) subproject that offers a C++ interface to generate tile-based OpenCL code in a just-in-time fashion.
+ - Ported the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface, with support for FP16/FP32 only:
+ - @ref experimental::dynamic_fusion::GpuCkwActivation
+ - @ref experimental::dynamic_fusion::GpuCkwCast
+ - @ref experimental::dynamic_fusion::GpuCkwDirectConv2d
+ - @ref experimental::dynamic_fusion::GpuCkwElementwiseBinary
+ - @ref experimental::dynamic_fusion::GpuCkwStore
- Various optimizations and bug fixes.
v23.05.1 Public patch release
diff --git a/filelist.json b/filelist.json
index b7845a760a..953b81de5a 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2353,6 +2353,7 @@
"src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
+ "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp",
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
new file mode 100644
index 0000000000..3c906646a6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "ckw/TensorTileSampler.h"
+#include "ckw/TileInfo.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+using TileContainer = std::vector<std::vector<std::string>>;
+
+GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{ id, tensors },
+ _src{},
+ _wei{},
+ _bia{},
+ _dst{},
+ _attributes{ attributes },
+ _settings{ settings }
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
+}
+
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+{
+ const auto desc = _settings.direct_conv_descriptor();
+ ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
+ "Only the weights tensor can be exported to cl_image");
+
+ const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Tunable parameters
+ const int32_t m0 = root_window.y().step();
+ const int32_t n0 = root_window.x().step();
+ const int32_t k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
+ const int32_t partial_n0 = _dst->dimension(0) % n0;
+
+ const int32_t K = _src->dimension(channel_idx);
+
+ // Exporting the weights tensor to an OpenCL image object is currently only supported when:
+ // a) k0 is equal to 4
+ // The current implementation expects to read a vector of 4 float values into the OpenCL image object.
+ // b) K is a multiple of 4
+ // This is a limitation in the current interface due to the variable table being responsible for maintaining
+ // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
+ // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
+ // extra loop to compute the left-over elements.
+ const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0);
+
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(
+ comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if(using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia");
+ }
+
+ // Constants
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_channels = static_cast<int32_t>(_src->dimension(channel_idx));
+ auto &tile_kernel_w = writer->declare_tile("kernel_w", kernel_width);
+ auto &tile_kernel_size = writer->declare_tile("kernel_size", kernel_width * kernel_height);
+ auto &tile_src_c = writer->declare_tile("src_c", static_cast<int32_t>(_src->dimension(channel_idx)));
+ auto &tile_dst_w = writer->declare_tile("dst_w", static_cast<int32_t>(_dst->dimension(width_idx)));
+ auto &tile_stride_x = writer->declare_tile("stride_x", static_cast<int32_t>(_attributes.stride().x()));
+ auto &tile_stride_y = writer->declare_tile("stride_y", static_cast<int32_t>(_attributes.stride().y()));
+ auto &tile_pad_x = writer->declare_tile("pad_x", static_cast<int32_t>(_attributes.pad().left));
+ auto &tile_pad_y = writer->declare_tile("pad_y", static_cast<int32_t>(_attributes.pad().top));
+ auto &tile_k0 = writer->declare_tile("k0", k0);
+ auto &tile_0 = writer->declare_tile("0", 0);
+ auto &tile_1 = writer->declare_tile("1", 1);
+
+ auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
+ auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
+ auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto &tile_cout = writer->declare_tile("cout", ckw::DataType::Int32); // OFM
+ auto &tile_mout = writer->declare_tile("mout", ckw::DataType::Int32); // WIDTH x HEIGHT
+ auto &tile_bout = writer->declare_tile("bout", ckw::DataType::Int32); // BATCH SIZE IDX
+
+ // Get the boundary aware coordinates at each global dimension index
+ get_coord(writer, tile_cout, tile_gid_0, n0, partial_n0, tile_cout.name() + "_dim0_", tile_0);
+ get_coord(writer, tile_mout, tile_gid_1, m0, 0, tile_mout.name() + "_dim1_", tile_0);
+ get_coord(writer, tile_bout, tile_gid_2, 1, 0, tile_bout.name() + "_dim2_", tile_0);
+
+ TensorTileSampler src_sampler;
+ src_sampler.width(k0);
+ src_sampler.height(m0);
+ src_sampler.format(TensorSamplerFormat::C_WH_1);
+ // We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+ // compute left-over elements
+ src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+ // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the
+ // indirection buffer mi does not contain negative values representing out-of-bounds reads.
+ src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly);
+ src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+ TensorTileSampler wei_sampler;
+ wei_sampler.width(k0);
+ wei_sampler.height(n0);
+ wei_sampler.format(TensorSamplerFormat::C_WH_1);
+ // We cannot have out-of-bounds accesses for the weights
+ wei_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+ wei_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+ wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+ TensorTileSampler dst_sampler;
+ dst_sampler.width(n0);
+ dst_sampler.height(m0);
+ dst_sampler.format(TensorSamplerFormat::C_WH_1);
+ dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
+ dst_sampler.address_mode_y(TensorSamplerAddressModeY::ClampToMaxEdgeOnly);
+ dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+ dst_sampler.x(tile_cout);
+ dst_sampler.y(tile_mout);
+ dst_sampler.z(tile_0);
+ dst_sampler.b(tile_bout);
+
+ if(!dst->has_tile())
+ {
+ auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
+ dst->init_virtual_tensor(tile, dst_sampler);
+ }
+ auto &tile_dst = dst->tile();
+
+ writer->op_assign(tile_dst, tile_0);
+
+ // We create a 2d container of size (M0, 1) to store the indices for iteration
+ TileContainer it;
+ for(int m = 0; m < m0; ++m)
+ {
+ std::vector<std::string> idx { std::to_string(m) };
+ it.push_back({ idx });
+ }
+ const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32);
+
+ auto &tile_xi = writer->declare_tile("xi", TileInfo(ckw::DataType::Int32, m0, 1));
+ auto &tile_yi = writer->declare_tile("yi", TileInfo(ckw::DataType::Int32, m0, 1));
+
+ // Convert the linear index to a coordinate
+ // xi = ((mout + i) % dst_w) * stride_x - pad_x
+ // yi = ((mout + i) / dst_w) * stride_y - pad_y
+ writer->op_binary_expression(tile_xi, tile_mout, BinaryOp::Add, tile_it);
+ writer->op_binary_expression(tile_yi, tile_mout, BinaryOp::Add, tile_it);
+ writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mod, tile_dst_w);
+ writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Div, tile_dst_w);
+ writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mul, tile_stride_x);
+ writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Mul, tile_stride_y);
+ writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Sub, tile_pad_x);
+ writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Sub, tile_pad_y);
+
+ auto &tile_y_b = writer->declare_tile("y_b", ckw::DataType::Int32);
+ writer->op_binary_expression(tile_y_b, tile_cout, BinaryOp::Mul, tile_kernel_size);
+
+ auto &tile_i = writer->declare_tile("i", ckw::DataType::Int32);
+ writer->op_assign(tile_i, tile_0);
+
+ // clang-format off
+ writer->op_for_loop(tile_i, BinaryOp::Less, tile_kernel_size, tile_i, AssignmentOp::Increment, tile_1, [&]()
+ {
+ auto &tile_x_k = writer->declare_tile("x_k", ckw::DataType::Int32);
+ auto &tile_y_k = writer->declare_tile("y_k", ckw::DataType::Int32);
+
+ writer->op_binary_expression(tile_x_k, tile_i, BinaryOp::Mod, tile_kernel_w);
+ writer->op_binary_expression(tile_y_k, tile_i, BinaryOp::Div, tile_kernel_w);
+
+ auto &tile_ck = writer->declare_tile("ck", ckw::DataType::Int32);
+ writer->op_assign(tile_ck, tile_0);
+
+ auto &tile_mi = writer->declare_tile("mi", TileInfo(ckw::DataType::Int32, m0, 1));
+ // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
+ // x_s = xi + x_k
+ // y_s = yi + y_k
+ // mi = x_s + y_s * width;
+ // mi = select(-1, mi, x_s >= 0);
+ // mi = select(-1, mi, x_s < width);
+ // mi = select(-1, mi, y_s >= 0);
+ // mi = select(-1, mi, y_s < height);
+ writer->util_get_indirect_buffer(tile_mi, src->tensor(), src_sampler, tile_xi, tile_yi, tile_x_k, tile_y_k);
+
+ src_sampler.x(tile_ck);
+ src_sampler.y(tile_mi);
+ src_sampler.z(tile_0);
+ src_sampler.b(tile_bout);
+
+ wei_sampler.x(tile_ck);
+ wei_sampler.y(tile_y_b);
+ wei_sampler.z(tile_0);
+ wei_sampler.b(tile_0);
+
+ auto &tile_src_c_minus_k0 = writer->declare_tile("src_c_minus_k0", src_channels - k0);
+
+ writer->op_for_loop(tile_ck, BinaryOp::LessEqual, tile_src_c_minus_k0, tile_ck, AssignmentOp::Increment, tile_k0, [&]()
+ {
+ auto &tile_lhs = writer->declare_tile("lhs", TileInfo(to_ckw(_src->data_type()), m0, k0));
+ auto &tile_rhs = writer->declare_tile("rhs", TileInfo(to_ckw(_wei->data_type()), n0, k0));
+ writer->op_assign(tile_lhs, tile_0);
+ writer->op_assign(tile_rhs, tile_0);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
+ writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+
+ writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+ });
+
+ // Left-over accumulations for when K is not a multiple of k0
+ if(!(K % k0 == 0))
+ {
+ writer->op_for_loop(tile_ck, BinaryOp::Less, tile_src_c, tile_ck, AssignmentOp::Increment, tile_1, [&]()
+ {
+ auto &tile_lhs = writer->declare_tile("lhs_leftover", TileInfo(to_ckw(_src->data_type()), m0, 1));
+ auto &tile_rhs = writer->declare_tile("rhs_leftover", TileInfo(to_ckw(_wei->data_type()), n0, 1));
+ writer->op_assign(tile_lhs, tile_0);
+ writer->op_assign(tile_rhs, tile_0);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
+ writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+
+ writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+ });
+ }
+
+ writer->op_binary_expression(tile_y_b, tile_y_b, BinaryOp::Add, tile_1);
+ });
+ // clang-format on
+
+ // Bias addition
+ // NOTE: This operation will be removed from this kernel once the interface is standardized. The intended way of
+ // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if(using_bias)
+ {
+ if(!bia->has_tile())
+ {
+ // Reuse the destination sampler for the bias
+ writer->op_load_once(bia, dst_sampler);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_bia);
+ }
+}
+
+Window GpuCkwDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto dst_shape = _dst->tensor_shape();
+ const auto desc = _settings.direct_conv_descriptor();
+
+ const unsigned int n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+ const unsigned int m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+
+ Window win = calculate_max_window(dst_shape, Steps(n0, m0));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], m0);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
+ win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
+
+ return win;
+}
+
+std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ return "direct_conv2d";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
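
To make the emitted index arithmetic easier to follow, here is a plain host-side C++ model of the xi/yi conversion and the out-of-bounds masking described by the comments in this file. It only mirrors the formulas above under the C_WH_1 layout assumption; it is not the code the writer generates.

    #include <cstdint>
    #include <vector>

    // For each of the m0 output rows, map the linear output index (mout + i) to a
    // spatial coordinate, offset it by the current kernel tap (x_k, y_k) and collapse
    // it into a single W*H index, or -1 when the read would fall outside the input.
    std::vector<int32_t> indirection_buffer(int32_t mout, int32_t m0, int32_t dst_w,
                                            int32_t stride_x, int32_t stride_y,
                                            int32_t pad_x, int32_t pad_y,
                                            int32_t x_k, int32_t y_k,
                                            int32_t src_w, int32_t src_h)
    {
        std::vector<int32_t> mi(m0);
        for(int32_t i = 0; i < m0; ++i)
        {
            const int32_t xi  = ((mout + i) % dst_w) * stride_x - pad_x; // output x -> input x
            const int32_t yi  = ((mout + i) / dst_w) * stride_y - pad_y; // output y -> input y
            const int32_t x_s = xi + x_k;
            const int32_t y_s = yi + y_k;
            const bool in_bounds = (x_s >= 0) && (x_s < src_w) && (y_s >= 0) && (y_s < src_h);
            mi[i] = in_bounds ? (x_s + y_s * src_w) : -1;
        }
        return mi;
    }
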
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
new file mode 100644
index 0000000000..ac32d2df32
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwDirectConv2d final : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+
+public:
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+ * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component
+ */
+ GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
+
+ /** Destructor */
+ ~GpuCkwDirectConv2d() override = default;
+
+ // Inherited methods overridden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
+ const ITensorInfo *_dst;
+
+ const Attributes _attributes;
+ const Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index 3965deced1..a713c82003 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -26,8 +26,14 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
#include "src/core/CL/CLValidate.h"
+
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -145,16 +151,27 @@ ClComponentDirectConv2d::ClComponentDirectConv2d(
const Attributes &attributes,
const Settings &settings)
: IGpuKernelComponent{ id, properties, tensors },
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+ _component_writer{ std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings) }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
}
+
ClComponentDirectConv2d::~ClComponentDirectConv2d()
{
}
+
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
+
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index 8e555dce57..24acb1b2c1 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -66,7 +66,11 @@ private:
};
/** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
class ClTemplateDirectConv2d;
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+class GpuCkwDirectConv2d;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentDirectConv2d final : public IGpuKernelComponent
{
@@ -134,8 +138,12 @@ public:
ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
/** Allow instances of this class to be moved */
ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
- /** Get template writer for the component */
+ /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuTemplateComponentWriter *template_writer() const override;
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Get component type */
GpuComponentType type() const override
{
@@ -143,7 +151,11 @@ public:
}
private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+ std::unique_ptr<GpuCkwDirectConv2d> _component_writer;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
index 5ab1fafe2f..f27a1796c9 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
#include "tests/AssetsLibrary.h"
#include "tests/CL/CLAccessor.h"
#include "tests/framework/Fixture.h"
@@ -250,5 +249,3 @@ TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF