diff options
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/ckw_driver/components')
24 files changed, 4360 insertions, 0 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp new file mode 100644 index 0000000000..18fda5bd6b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "GpuCkwActivation.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwActivation::GpuCkwActivation(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes) // NOLINT + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_h = static_cast<int32_t>(_dst->dimension(1)); + const auto dst_dt = to_ckw(_dst->data_type()); + + // CKW constants + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_neg_1_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f}}, dst_dt)); + auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, dst_dt)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_A_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.a()}}, dst_dt)); + auto const_B_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.b()}}, dst_dt)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + + // Destination compute block size + int32_t dst_n0 = -1; + int32_t dst_m0 = -1; + + // Destination compute block size left-over + int32_t dst_n0_partial = -1; + int32_t dst_m0_partial = -1; + + // Shift-back for the overlapping-min strategy + int32_t dst_shift_back = -1; + + if (!dst->has_tile()) + { + // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1 + // as tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + dst_n0 = root_window.x().step(); + dst_m0 = root_window.y().step(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + } + else + { + // dst_m0_partial depends on the TensorSamplerFormat + dst_n0 = dst->tile().tile_info().width(); + dst_m0 = dst->tile().tile_info().height(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + + ckw::TensorSampler sampler_dst = dst->tensor_sampler(); + + if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + } + else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + dst_m0_partial = _dst->dimension(1) % dst_m0; + } + + // Shift-back for the overlapping-min strategy + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + } + + const auto &tile_dst = dst->tile(); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_dst_shift_back_n0 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + if (!src->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_src = dst->tensor_sampler(); + + auto tile_gid_0 = writer->declare_tile("gid_0_src", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1_src", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2_src", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_nout0 = writer->declare_tile("nout0_src", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = + writer->declare_tile("mout0_src", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1_src", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0_src", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_dst_n0, const_dst_shift_back_n0, + const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0); + + // Get the boundary aware coordinates at each global dimension index + if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + } + + auto tile_src = writer->declare_tile("src", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + writer->op_load(tile_src, src->tensor(), sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() it is used to bring the tile_src outside the compound statement + src->init_virtual_tensor(tile_src, sampler_src); + } + + const auto &tile_src = src->tile(); + + /******************************************************************************** + * 7 - Write the rest of the code + ********************************************************************************/ + switch (_attributes.activation()) + { + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + { + // dst = src * -1 + writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_neg_1_fp); + // dst = exp(src * -1) + writer->op_unary(tile_dst, ckw::UnaryOp::Exp, tile_dst); + // dst = 1 + (exp(src * -1)) + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, const_pos_1_fp); + // dst = 1 / 1 + (exp(src * -1)) + writer->op_binary(tile_dst, ckw::BinaryOp::Div, const_pos_1_fp, tile_dst); + break; + } + case ActivationLayerInfo::ActivationFunction::TANH: + { + writer->op_unary(tile_dst, ckw::UnaryOp::Tanh, tile_src); + break; + } + case ActivationLayerInfo::ActivationFunction::RELU: + { + // dst = max(src, 0) + writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp); + break; + } + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + { + //dst = max(src, 0) + writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp); + //dst = min(max(src, 0), A_VAL) + writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp); + break; + } + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + { + //dst = max(src, B_VAL) + writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_B_fp); + //dst = min(max(src, B_VAL), A_VAL) + writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp); + break; + } + default: + CKW_ASSERT(false); + break; + } + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwActivation::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + constexpr uint32_t vector_size_byte_opencl = 16; + const uint32_t num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h new file mode 100644 index 0000000000..386e933a72 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwActivation : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentActivation::Attributes; + /** Constructor + * + * For supported configurations please refer to @ref GpuCkwActivation::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + */ + GpuCkwActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation); + /** Destructor */ + ~GpuCkwActivation() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION */ diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp new file mode 100644 index 0000000000..d3e0dbafd4 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "GpuCkwCast.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON_MSG(is_data_type_float(_src->data_type()) == false, + "The source data type must be a floating-point data type"); +} + +void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_h = static_cast<int32_t>(_dst->dimension(1)); + + // CKW constants + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + + // Destination compute block size + int32_t dst_n0 = -1; + int32_t dst_m0 = -1; + + // Destination compute block size left-over + int32_t dst_n0_partial = -1; + int32_t dst_m0_partial = -1; + + // Shift-back for the overlapping-min strategy + int32_t dst_shift_back = -1; + + if (!dst->has_tile()) + { + // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1 + // as tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + dst_n0 = root_window.x().step(); + dst_m0 = root_window.y().step(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + ckw::DataType dst_dt = to_ckw(_dst->data_type()); + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + } + else + { + // Change dst_n0 and dst_m0 if NOT root component! + // ATTENTION: + // dst_m0_partial depends on the TensorSamplerFormat + dst_n0 = dst->tile().tile_info().width(); + dst_m0 = dst->tile().tile_info().height(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + + ckw::TensorSampler sampler_dst = dst->tensor_sampler(); + + if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + } + else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + dst_m0_partial = _dst->dimension(1) % dst_m0; + } + + // Shift-back for the overlapping-min strategy + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + } + + const auto &tile_dst = dst->tile(); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_dst_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + if (!src->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_src = dst->tensor_sampler(); + + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32, + const_dst_shift_back_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32); + + // Get the boundary aware coordinates at each global dimension index + if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + } + ckw::DataType src_dt = to_ckw(_src->data_type()); + auto tile_src = writer->declare_tile("src", ckw::TileInfo(src_dt, dst_m0, dst_n0)); + + writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() it is used to bring the tile_src outside the compound statement + src->init_virtual_tensor(tile_src, sampler_src); + } + + auto tile_src = src->tile(); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code (optional) + ********************************************************************************/ + + // Not required + + /******************************************************************************** + * 7 - Write the rest of the code + ********************************************************************************/ + // Only None ConvertPolicy is supported for floating-point data types + ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None; + + writer->op_cast(tile_dst, tile_src, convert_policy); + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwCast::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + constexpr uint32_t vector_size_byte_opencl = 16; + const uint32_t num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h new file mode 100644 index 0000000000..2389301196 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwCast : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentCast::Attributes; + /** Constructor + * + * For supported configurations please refer to @ref ClComponentCast::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + */ + GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast); + /** Destructor */ + ~GpuCkwCast() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST */ diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp new file mode 100644 index 0000000000..cfccab186b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwDepthwiseConv2d::GpuCkwDepthwiseConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _bia, _dst); +} + +void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + // Data Layout is NHWC + const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + GpuCkwComponentArgument *bia = nullptr; + + const bool using_bias = _bia != nullptr; + + if (using_bias) + { + bia = vtable.declare_variable(comp_group, writer, _bia, "bia"); + } + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx)); + const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx)); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + const auto stride_x = static_cast<int32_t>(_attributes.stride().x()); + const auto stride_y = static_cast<int32_t>(_attributes.stride().y()); + const auto pad_x = static_cast<int32_t>(_attributes.pad().left); + const auto pad_y = static_cast<int32_t>(_attributes.pad().top); + const auto depth_multiplier = static_cast<int32_t>(_attributes.depth_multiplier()); + const auto dilation_x = static_cast<int32_t>(_attributes.dilation().x()); + const auto dilation_y = static_cast<int32_t>(_attributes.dilation().y()); + const auto kernel_size = kernel_width * kernel_height; + + // CKW constants + auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32)); + auto const_kernel_size_i32 = + writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32)); + auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32)); + auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32)); + auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32)); + auto const_depth_multiplier_i32 = + writer->declare_constant_tile(ckw::ConstantData({{depth_multiplier}}, ckw::DataType::Int32)); + auto const_dilation_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_x}}, ckw::DataType::Int32)); + auto const_dilation_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_y}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + const int32_t src_m0 = kernel_width + (dst_m0 - 1); + const int32_t src_n0 = depth_multiplier > 1 ? 1 : dst_n0; + const int32_t wei_m0 = kernel_width; + const int32_t wei_n0 = dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize the destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensors + ********************************************************************************/ + // SOURCE SAMPLER + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::SkipLessThanZero); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // WEIGHTS SAMPLER + // We cannot have out-of-bounds accesses for the weights + ckw::TensorSampler sampler_wei; + sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + if (_settings.export_weights_to_cl_image()) + { + sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly); + } + else + { + sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr); + } + + // BIAS SAMPLER + ckw::TensorSampler sampler_bia; + sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_bia.address_mode_x(sampler_dst.address_mode_x()); + sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code (Optional) + ********************************************************************************/ + // Not required + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32, + const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32); + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + auto tile_src_ci = writer->declare_tile("src_ci", ckw::DataType::Int32); + writer->op_binary(tile_src_ci, ckw::BinaryOp::Div, tile_cout0, const_depth_multiplier_i32); + + auto tile_src_xi = writer->declare_tile("src_xi", ckw::DataType::Int32); + writer->op_binary(tile_src_xi, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32); + writer->op_binary(tile_src_xi, ckw::BinaryOp::Sub, tile_src_xi, const_pad_x_i32); + + auto tile_src_yi = writer->declare_tile("src_yi", ckw::DataType::Int32); + writer->op_binary(tile_src_yi, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32); + writer->op_binary(tile_src_yi, ckw::BinaryOp::Sub, tile_src_yi, const_pad_y_i32); + + // Loop variables + auto tile_yk = writer->declare_tile("yk", ckw::DataType::Int32); + + writer->op_assign(tile_yk, const_0_i32); + + // clang-format off + writer->op_for_loop(tile_yk, ckw::BinaryOp::Less, const_kernel_size_i32, tile_yk, ckw::AssignmentOp::Increment, const_kernel_w_i32, + [&]() + { + auto tile_src = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), src_m0, src_n0)); + auto tile_wei = writer->declare_tile("b", ckw::TileInfo(to_ckw(_wei->data_type()), wei_m0, wei_n0)); + + writer->op_assign(tile_src, const_0_fp); + + auto tile_x_gte_0 = writer->declare_tile("x_gte_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_y_gte_0 = writer->declare_tile("y_gte_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_x_lt_w = writer->declare_tile("x_lt_w", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_y_lt_h = writer->declare_tile("y_lt_h", ckw::TileInfo(ckw::DataType::Int32)); + + // Check if yi + yk * DILATION_Y is out-of-bound + writer->op_binary(tile_y_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_yi, const_0_i32); + writer->op_binary(tile_y_lt_h, ckw::BinaryOp::Less, tile_src_yi, const_src_h_i32); + + auto tile_src_mi = writer->declare_tile("src_mi", ckw::TileInfo(ckw::DataType::Int32)); + + // Load src + for(int32_t xk = 0; xk < src_m0; ++xk) + { + auto const_xk_i32 = writer->declare_constant_tile(ckw::ConstantData({{xk}}, ckw::DataType::Int32)); + + // xi + xk * DILATION_X + writer->op_binary(tile_src_mi, ckw::BinaryOp::Mul, const_xk_i32, const_dilation_x_i32); + writer->op_binary(tile_src_mi, ckw::BinaryOp::Add, tile_src_mi, tile_src_xi); + + // Check if xi + xk * DILATION_X is out-of-bound + writer->op_binary(tile_x_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_mi, const_0_i32); + writer->op_binary(tile_x_lt_w, ckw::BinaryOp::Less, tile_src_mi, const_src_w_i32); + + // Set mi to -1 if we have out-of-bound memory accesses + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_gte_0); + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_lt_w); + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_gte_0); + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_lt_h); + + writer->op_load(tile_src.row(xk), src->tensor(), sampler_src, tile_src_ci, tile_src_mi, tile_src_yi, tile_bout0); + } + + // Load wei + writer->op_load(tile_wei, wei->tensor(), sampler_wei, tile_cout0, tile_yk, const_0_i32, const_0_i32); + + // Attention: MAC (Multiply-and-Accumulate) ternary operator is currently unsupported in CKW + // Therefore, this part should be replaced with the MAC ternary operator when availabe + auto tile_tmp = writer->declare_tile("tmp", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0)); + for(int32_t m0 = 0; m0 < dst_m0; ++m0) + { + for(int32_t xk = 0; xk < kernel_width; ++xk) + { + auto tile_a = tile_src.row(m0 + xk); + auto tile_b = tile_wei.row(xk); + auto tile_c = tile_dst.row(m0); + + writer->op_binary(tile_tmp, ckw::BinaryOp::Mul, tile_a, tile_b); + writer->op_binary(tile_c, ckw::BinaryOp::Add, tile_c, tile_tmp); + } + } + writer->op_binary(tile_src_yi, ckw::BinaryOp::Add, tile_src_yi, const_dilation_y_i32); + }); + // clang-format on + + // Bias addition + // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of + // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. + if (using_bias) + { + if (!bia->has_tile()) + { + auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0)); + writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout0, const_0_i32, const_0_i32, const_0_i32); + bia->init_virtual_tensor(tile_bia, sampler_bia); + } + auto &tile_bia = bia->tile(); + + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia); + } + + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwDepthwiseConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + TensorShape output_shape = _dst->tensor_shape(); + + Window win = calculate_max_window(output_shape, Steps(_settings.n0(), _settings.m0())); + return win.collapse(win, Window::DimZ); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h new file mode 100644 index 0000000000..a15d3ee710 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +class GpuCkwDepthwiseConv2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentDepthwiseConv2d::Attributes; + using Settings = ClComponentDepthwiseConv2d::Settings; + + /** Constructor + * + * For supported configurations please refer to @ref ClComponentDepthwiseConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + GpuCkwDepthwiseConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDepthwiseConv2d); + /** Destructor */ + ~GpuCkwDepthwiseConv2d() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_wei; + const ITensorInfo *_bia; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp new file mode 100644 index 0000000000..eb4f644eb6 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +using TileContainer = std::vector<std::vector<int32_t>>; + +GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null +} + +void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const auto desc = _settings.direct_conv_descriptor(); + ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image, + "Only the weights tensor can be exported to cl_image"); + + const uint32_t channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); + const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + GpuCkwComponentArgument *bia = nullptr; + + const bool using_bias = _bia != nullptr; + + if (using_bias) + { + bia = vtable.declare_variable(comp_group, writer, _bia, "bia"); + } + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx)); + const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx)); + const auto src_c = static_cast<int32_t>(_src->dimension(channel_idx)); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_w = static_cast<int32_t>(_dst->dimension(width_idx)); + const auto stride_x = static_cast<int32_t>(_attributes.stride().x()); + const auto stride_y = static_cast<int32_t>(_attributes.stride().y()); + const auto pad_x = static_cast<int32_t>(_attributes.pad().left); + const auto pad_y = static_cast<int32_t>(_attributes.pad().top); + const auto kernel_size = kernel_width * kernel_height; + const auto k0 = + static_cast<int32_t>(adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx))); + + // CKW constants + auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32)); + auto const_kernel_size_i32 = + writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32)); + auto const_src_c_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_c}}, ckw::DataType::Int32)); + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_w}}, ckw::DataType::Int32)); + auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32)); + auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32)); + auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32)); + auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32)); + auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_src_c_i32_minus_k0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{src_c - k0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the samplers for the input tensors + ********************************************************************************/ + // Exporting the weights tensor to an OpenCL image object is currently only supported when: + // a) k0 is equal to 4 + // The current implementation expects to read a vector of 4 float values into the OpenCL image object. + // b) K is a multiple of 4 + // This is a limitation in the current interface due to the variable table being responsible for maintaining + // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot + // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an + // extra loop to compute the left-over elements. + const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (src_c % 4 == 0); + + // SOURCE SAMPLER + // - We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to + // compute left-over elements + // - We cannot have out-of-bounds reads when the kernel height is equal to 1. In all other cases, we need to ensure the + // indirection buffer mi does not contain negative values representing out-of-bounds reads. + auto address_mode_y_src = + kernel_height == 1 ? ckw::TensorSamplerAddressModeY::None : ckw::TensorSamplerAddressModeY::SkipLessThanZero; + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(address_mode_y_src); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // WEIGHTS SAMPLER + // We cannot have out-of-bounds accesses for the weights + ckw::TensorSampler sampler_wei; + sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension + sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + if (use_cl_image_for_weights) + { + sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly); + } + else + { + sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr); + } + + // BIAS SAMPLER + ckw::TensorSampler sampler_bia; + + if (using_bias) + { + sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + sampler_bia.address_mode_x(sampler_dst.address_mode_x()); + sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr); + } + + /******************************************************************************** + * 6 - Extra operations required before writing the main code (optional) + ********************************************************************************/ + + // Not required + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout = writer->declare_tile("cout", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout = writer->declare_tile("mout", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH x HEIGHT + auto tile_bout = writer->declare_tile("bout", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout, tile_gid_0, const_dst_n0_i32, + const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout, tile_gid_1, const_dst_m0_i32); + get_coordinate_from_gws(writer, tile_bout, tile_gid_2, const_pos_1_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + // We create a 2d container of size (dst_m0, 1) to store the indices for iteration + TileContainer it; + for (int32_t m = 0; m < dst_m0; ++m) + { + std::vector<int32_t> idx{m}; + it.push_back({idx}); + } + + const auto &const_idxs = writer->declare_constant_tile(ckw::ConstantData(it, ckw::DataType::Int32)); + + auto tile_xi = writer->declare_tile("xi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_yi = writer->declare_tile("yi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + + // Convert the linear index to coordinate + // xi = ((mout + i) % dst_w) * stride_x - pad_x + // yi = ((mout + i) / dst_w) * stride_y - pad_y + writer->op_binary(tile_xi, ckw::BinaryOp::Add, tile_mout, const_idxs); + writer->op_binary(tile_yi, ckw::BinaryOp::Add, tile_mout, const_idxs); + writer->op_binary(tile_xi, ckw::BinaryOp::Mod, tile_xi, const_dst_w_i32); + writer->op_binary(tile_yi, ckw::BinaryOp::Div, tile_yi, const_dst_w_i32); + writer->op_binary(tile_xi, ckw::BinaryOp::Mul, tile_xi, const_stride_x_i32); + writer->op_binary(tile_yi, ckw::BinaryOp::Mul, tile_yi, const_stride_y_i32); + writer->op_binary(tile_xi, ckw::BinaryOp::Sub, tile_xi, const_pad_x_i32); + writer->op_binary(tile_yi, ckw::BinaryOp::Sub, tile_yi, const_pad_y_i32); + + auto tile_y_b = writer->declare_tile("y_b", ckw::TileInfo(ckw::DataType::Int32)); + writer->op_binary(tile_y_b, ckw::BinaryOp::Mul, tile_cout, const_kernel_size_i32); + + auto tile_i = writer->declare_tile("i", ckw::TileInfo(ckw::DataType::Int32)); + writer->op_assign(tile_i, const_0_i32); + + // clang-format off + writer->op_for_loop(tile_i, ckw::BinaryOp::Less, const_kernel_size_i32, tile_i, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]() + { + auto tile_x_k = writer->declare_tile("x_k", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_y_k = writer->declare_tile("y_k", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_binary(tile_x_k, ckw::BinaryOp::Mod, tile_i, const_kernel_w_i32); + writer->op_binary(tile_y_k, ckw::BinaryOp::Div, tile_i, const_kernel_w_i32); + + auto tile_ck = writer->declare_tile("ck", ckw::TileInfo(ckw::DataType::Int32)); + writer->op_assign(tile_ck, const_0_i32); + + // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor + // x_s = xi + x_k + // y_s = yi + y_k + // mi = x_s + y_s * width; + // mi = select(-1, mi, x_s >= 0); + // mi = select(-1, mi, x_s < width); + // mi = select(-1, mi, y_s >= 0); + // mi = select(-1, mi, y_s < height); + auto tile_xs = writer->declare_tile("xs", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_ys = writer->declare_tile("ys", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_mi = writer->declare_tile("mi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + + auto tile_xs_gte_0 = writer->declare_tile("xs_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_ys_gte_0 = writer->declare_tile("ys_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_xs_lt_w = writer->declare_tile("xs_lt_w", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_ys_lt_h = writer->declare_tile("ys_lt_h", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + + writer->op_binary(tile_xs, ckw::BinaryOp::Add, tile_xi, tile_x_k); + writer->op_binary(tile_ys, ckw::BinaryOp::Add, tile_yi, tile_y_k); + writer->op_binary(tile_mi, ckw::BinaryOp::Mul, tile_ys, const_src_w_i32); + writer->op_binary(tile_mi, ckw::BinaryOp::Add, tile_mi, tile_xs); + writer->op_binary(tile_xs_gte_0, ckw::BinaryOp::GreaterEqual, tile_xs, const_0_i32); + writer->op_binary(tile_ys_gte_0, ckw::BinaryOp::GreaterEqual, tile_ys, const_0_i32); + writer->op_binary(tile_xs_lt_w, ckw::BinaryOp::Less, tile_xs, const_src_w_i32); + writer->op_binary(tile_ys_lt_h, ckw::BinaryOp::Less, tile_ys, const_src_h_i32); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_gte_0); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_gte_0); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_lt_w); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_lt_h); + + writer->op_for_loop(tile_ck, ckw::BinaryOp::LessEqual, const_src_c_i32_minus_k0_i32, tile_ck, ckw::AssignmentOp::Increment, const_k0_i32, [&]() + { + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, k0)); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, k0)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, const_0_fp); + + writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout); + writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + }); + + // Left-over accumulations for when K is not a multiple of k0 + if(((src_c % k0) != 0)) + { + writer->op_for_loop(tile_ck, ckw::BinaryOp::Less, const_src_c_i32, tile_ck, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]() + { + auto tile_lhs = writer->declare_tile("lhs_leftover", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, 1)); + auto tile_rhs = writer->declare_tile("rhs_leftover", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, 1)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, const_0_fp); + + writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout); + writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + }); + } + + writer->op_binary(tile_y_b, ckw::BinaryOp::Add, tile_y_b, const_pos_1_i32); + }); + // clang-format on + + // NOTE: The bias addition will be removed from this kernel as the interface is standardized. The intended way of + // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. + if (using_bias) + { + if (!bia->has_tile()) + { + auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0)); + writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout, const_0_i32, const_0_i32, const_0_i32); + bia->init_virtual_tensor(tile_bia, sampler_bia); + } + auto &tile_bia = bia->tile(); + + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia); + } + + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwDirectConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const auto dst_shape = _dst->tensor_shape(); + const auto desc = _settings.direct_conv_descriptor(); + + const uint32_t dst_n0 = adjust_vec_size(desc.n0, dst_shape[0]); + const uint32_t dst_m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]); + + Window win = calculate_max_window(dst_shape, Steps(dst_n0, dst_m0)); + + const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], dst_m0); + win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, dst_m0)); + win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1)); + + return win; +} + +std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + return "direct_conv2d"; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h new file mode 100644 index 0000000000..139cf620e2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwDirectConv2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentDirectConv2d::Attributes; + using Settings = ClComponentDirectConv2d::Settings; + +public: + /** Constructor + * + * For supported configurations please refer to @ref ClComponentDirectConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does + * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component + */ + GpuCkwDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d); + /** Destructor */ + ~GpuCkwDirectConv2d() override = default; + + // Inherited methods overriden + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_wei; + const ITensorInfo *_bia; + const ITensorInfo *_dst; + + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp new file mode 100644 index 0000000000..fb55acad53 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "GpuCkwElementwiseBinary.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "support/StringSupport.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include "compute_kernel_writer/include/ckw/types/ConstantData.h" +#include "compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} +{ + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); +} + +void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs"); + GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_h = static_cast<int32_t>(_dst->dimension(1)); + + // CKW constants + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + + // Destination compute block size + int32_t dst_n0 = -1; + int32_t dst_m0 = -1; + + // Destination compute block size left-over + int32_t dst_n0_partial = -1; + int32_t dst_m0_partial = -1; + + if (!dst->has_tile()) + { + // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1 + // as tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + dst_n0 = root_window.x().step(); + dst_m0 = root_window.y().step(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + ckw::DataType dst_dt = to_ckw(_dst->data_type()); + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + } + else + { + // Change dst_n0 and dst_m0 if NOT root component! + dst_n0 = dst->tile().tile_info().width(); + dst_m0 = dst->tile().tile_info().height(); + + // Here, it is not required the calculation of dst_n0_partial and dst_m0_partial + // because if we enter this condition it means that the element-wise op is not the + // root component and the address modes have been already set. + } + + const auto &tile_dst = dst->tile(); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // ... + + /******************************************************************************** + * 5 - Define the samplers for the input tensors + ********************************************************************************/ + // Check whether the lhs tensor is a tile or tensor + // If it is a tile, create a sampler and load the content in a tile + if (!lhs->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_lhs = dst->tensor_sampler(); + + bool broadcast_x = false; + bool broadcast_y = false; + + int32_t lhs_n0 = dst_n0; + int32_t lhs_m0 = dst_m0; + + // Check whether we have broadcasting + // In case of broadcast, lhs can only be a vector or scalar. + // Broadcasting in other dimensions is not supported + if (_dst->dimension(0) != _lhs->dimension(0)) + { + broadcast_x = true; + lhs_n0 = 1; + } + + if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + if (_dst->dimension(1) * _dst->dimension(2) != _lhs->dimension(1) * _lhs->dimension(2)) + { + broadcast_y = true; + lhs_m0 = 1; + } + } + else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + if (_dst->dimension(1) != _lhs->dimension(1)) + { + broadcast_y = true; + lhs_m0 = 1; + } + } + + const int32_t lhs_partial_n0 = _lhs->dimension(0) % lhs_n0; + const int32_t lhs_shift_back = (lhs_n0 - lhs_partial_n0) % lhs_n0; + + // Constants + auto const_lhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_n0}}, ckw::DataType::Int32)); + auto const_lhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_m0}}, ckw::DataType::Int32)); + auto const_lhs_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{lhs_shift_back}}, ckw::DataType::Int32)); + + auto tile_gid_0 = writer->declare_tile("gid_0_lhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1_lhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2_lhs", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = + writer->declare_tile("mout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1_lhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + if (!broadcast_x) + { + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_lhs_n0_i32, + const_lhs_shift_back_n0_i32, const_0_i32); + } + else + { + writer->op_assign(tile_cout0, const_0_i32); + } + + if (!broadcast_y) + { + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_lhs_m0_i32); + } + else + { + writer->op_assign(tile_mout0, const_0_i32); + } + + // Get the boundary aware coordinates at each global dimension index + if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + // For tile_mout1 and tile_bout0 the step can only be 1 + if (!broadcast_y) + { + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + } + else + { + // If broadcast_y == true, it means that we have either a scalar or vector + // because broadcasting in other dimensions is not supported + writer->op_assign(tile_mout1, const_0_i32); + } + + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + } + + ckw::DataType lhs_dt = to_ckw(_lhs->data_type()); + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(lhs_dt, lhs_m0, lhs_n0)); + + writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() is used to bring the tile_lhs outside the compound statement + lhs->init_virtual_tensor(tile_lhs, sampler_lhs); + } + + // Check whether the rhs tensor is a tile or tensor + // If it is a tile, create a sampler and load the content in a tile + if (!rhs->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_rhs = dst->tensor_sampler(); + + bool broadcast_x = false; + bool broadcast_y = false; + + int32_t rhs_n0 = dst_n0; + int32_t rhs_m0 = dst_m0; + + // Check whether we have broadcasting + // In case of broadcast, rhs can only be a vector or scalar. + // Broadcasting in other dimensions is not supported + if (_dst->dimension(0) != _rhs->dimension(0)) + { + broadcast_x = true; + rhs_n0 = 1; + } + + if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + if (_dst->dimension(1) * _dst->dimension(2) != _rhs->dimension(1) * _rhs->dimension(2)) + { + broadcast_y = true; + rhs_m0 = 1; + } + } + else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + if (_dst->dimension(1) != _rhs->dimension(1)) + { + broadcast_y = true; + rhs_m0 = 1; + } + } + + const int32_t rhs_partial_n0 = _rhs->dimension(0) % rhs_n0; + const int32_t rhs_shift_back = (rhs_n0 - rhs_partial_n0) % rhs_n0; + + // Constants + auto const_rhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_n0}}, ckw::DataType::Int32)); + auto const_rhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_m0}}, ckw::DataType::Int32)); + auto const_rhs_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{rhs_shift_back}}, ckw::DataType::Int32)); + + auto tile_gid_0 = writer->declare_tile("gid_0_rhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1_rhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2_rhs", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = + writer->declare_tile("mout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1_rhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + if (!broadcast_x) + { + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_rhs_n0_i32, + const_rhs_shift_back_n0_i32, const_0_i32); + } + else + { + writer->op_assign(tile_cout0, const_0_i32); + } + + if (!broadcast_y) + { + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_rhs_m0_i32); + } + else + { + writer->op_assign(tile_mout0, const_0_i32); + } + + // Get the boundary aware coordinates at each global dimension index + if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + // For tile_mout1 and tile_bout0 the step can only be 1 + const auto src_w = static_cast<int32_t>(_rhs->dimension(1)); + auto const_src_w = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + if (!broadcast_y) + { + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_mout1, const_src_w); + } + else + { + // If broadcast_y == true, it means that we have either a scalar or vector + // because broadcasting in other dimensions is not supported + writer->op_assign(tile_mout1, const_0_i32); + } + + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_mout1, const_src_w); + } + + ckw::DataType rhs_dt = to_ckw(_rhs->data_type()); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(rhs_dt, rhs_m0, rhs_n0)); + + writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() is used to bring the tile_rhs outside the compound statement + rhs->init_virtual_tensor(tile_rhs, sampler_rhs); + } + + const auto &tile_lhs = lhs->tile(); + const auto &tile_rhs = rhs->tile(); + + /******************************************************************************** + * 7 - Write the rest of the code + ********************************************************************************/ + // Perform the element-wise operation + writer->op_binary(tile_dst, to_ckw(_attributes), tile_lhs, tile_rhs); + + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwElementwiseBinary::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + constexpr uint32_t vector_size_byte_opencl = 16; + const uint32_t num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + + return win; +} + +std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + const std::vector<std::string> build_params = { + "elementwise_binary", + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), + }; + return join(build_params, "_"); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h new file mode 100644 index 0000000000..c6cbba28d3 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwElementwiseBinary : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentElementwiseBinary::Attributes; + /** Constructor + * + * For supported configurations please refer to @ref ClComponentElementwiseBinary::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + */ + GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary); + /** Destructor */ + ~GpuCkwElementwiseBinary() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_lhs; + const ITensorInfo *_rhs; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp new file mode 100644 index 0000000000..14ad3847fc --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "support/StringSupport.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwMatMul::GpuCkwMatMul(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings} +{ + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); +} + +void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs"); + GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto k = + _attributes.adj_lhs() ? static_cast<int32_t>(_lhs->dimension(1)) : static_cast<int32_t>(_lhs->dimension(0)); + const auto k0 = static_cast<int32_t>(adjust_vec_size(_settings.k0(), k)); + const auto dst_dt = to_ckw(_dst->data_type()); + + // CKW constants + auto const_k_i32 = writer->declare_constant_tile(ckw::ConstantData({{k}}, ckw::DataType::Int32)); + auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_k_minus_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k - k0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the samplers for the input tensors + ********************************************************************************/ + // LHS SAMPLER + // The assumption here is that M is multiple of M0. This limitation will be removed once + // we have the support for OverlappingMin as address mode for the Y direction + ckw::TensorSampler sampler_lhs; + sampler_lhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_lhs.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_lhs.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_lhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_lhs.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // RHS SAMPLER + ckw::TensorSampler sampler_rhs; + sampler_rhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_rhs.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_rhs.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_rhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_rhs.storage(ckw::TensorStorageType::BufferUint8Ptr); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code (optional) + ********************************************************************************/ + + // Not required + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_idx_n = writer->declare_tile("idx_n", ckw::TileInfo(ckw::DataType::Int32)); // N index + auto tile_idx_m = writer->declare_tile("idx_m", ckw::TileInfo(ckw::DataType::Int32)); // M index + auto tile_idx_b = writer->declare_tile("idx_b", ckw::TileInfo(ckw::DataType::Int32)); // BATCH index + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_idx_n, tile_gid_0, const_dst_n0_i32, + const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_idx_m, tile_gid_1, const_dst_m0_i32); + get_coordinate_from_gws(writer, tile_idx_b, tile_gid_2, const_pos_1_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + auto tile_idx_k = writer->declare_tile("idx_k", ckw::TileInfo(ckw::DataType::Int32)); // K index + + writer->op_assign(tile_idx_k, const_0_i32); + + // clang-format off + writer->op_for_loop(tile_idx_k, ckw::BinaryOp::LessEqual, const_k_minus_k0_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_k0_i32, + [&]() + { + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, k0)); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, k0)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, const_0_fp); + + writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32); + writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + + }); + + // Left-over accumulations for when K is not a multiple of k0 + if(((k % k0) != 0)) + { + writer->op_for_loop(tile_idx_k, ckw::BinaryOp::Less, const_k_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]() + { + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, 1)); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, 1)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, const_0_fp); + + writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32); + writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + }); + } + // clang-format on +} + +Window GpuCkwMatMul::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const int32_t m = _dst->dimension(1); + const int32_t n = _dst->dimension(0); + const bool adj_lhs = _attributes.adj_lhs(); + + const int32_t m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m); + const int32_t n0 = adjust_vec_size(_settings.n0(), n); + + // Configure kernel window + Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0)); + win = win.collapse(win, Window::DimZ); + + return win; +} + +std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string kernel_name("mat_mul_native"); + + const int32_t m = _dst->dimension(1); + const int32_t n = _dst->dimension(0); + const int32_t k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x(); + + kernel_name += _attributes.adj_lhs() ? "_t" : "_nt"; + kernel_name += _attributes.adj_rhs() ? "_t" : "_nt"; + kernel_name += "_"; + kernel_name += support::cpp11::to_string(m); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(n); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(k); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_dst->dimension(2)); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.m0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.n0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.k0()); + + return kernel_name; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h new file mode 100644 index 0000000000..790418bf50 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwMatMul final : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentMatMul::Attributes; + using Settings = ClComponentMatMul::Settings; + +public: + /** Constructor + * + * For supported configurations please refer to @ref ClComponentMatMul::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does + * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component + */ + GpuCkwMatMul(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwMatMul); + + /** Destructor */ + ~GpuCkwMatMul() override = default; + + // Inherited methods overriden + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_lhs; + const ITensorInfo *_rhs; + const ITensorInfo *_dst; + + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp new file mode 100644 index 0000000000..d027f348ef --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwPool2d::GpuCkwPool2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} + +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const uint32_t width_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const auto pool_sz_x = static_cast<int32_t>(_attributes.pool_size().x()); + const auto pool_sz_y = static_cast<int32_t>(_attributes.pool_size().y()); + const auto pad_x = static_cast<int32_t>(_attributes.pad().left); + const auto pad_y = static_cast<int32_t>(_attributes.pad().top); + const auto stride_x = static_cast<int32_t>(_attributes.stride().x()); + const auto stride_y = static_cast<int32_t>(_attributes.stride().y()); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + + // CKW constants + auto const_pool_sz_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_x}}, ckw::DataType::Int32)); + auto const_pool_sz_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_y}}, ckw::DataType::Int32)); + auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32)); + auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32)); + auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32)); + auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32)); + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_lowest_val_fp = + writer->declare_constant_tile(ckw::ConstantData({{std::numeric_limits<float>::lowest()}}, ckw::DataType::Fp32)); + auto const_neg_inf_val_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f / 0.0f}}, ckw::DataType::Fp32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code + ********************************************************************************/ + // Check if it is global pooling + const bool is_global_pooling = (pool_sz_x == src_w) && (pool_sz_y == src_h) && (pad_x == 0) && (pad_y == 0); + + // Accumulate always in F32 if the pool type is not MAX + const bool acc_f32 = (dst_dt == ckw::DataType::Fp32) || + ((dst_dt == ckw::DataType::Fp16) && _attributes.pool_type() != PoolingType::MAX); + + const auto acc_dt = acc_f32 ? ckw::DataType::Fp32 : ckw::DataType::Fp16; + + const bool is_wider_acc = dst_dt != acc_dt; + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32, + const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32); + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + // A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling. + auto tile_res = writer->declare_tile("tile_res", ckw::TileInfo(acc_dt, dst_m0, dst_n0)); + + // Initialise result tile with appropriate value + if (_attributes.pool_type() == PoolingType::MAX) + { + if (_settings.use_inf_as_limit()) + { + writer->op_cast(tile_res, const_neg_inf_val_fp, ckw::ConvertPolicy::None); + } + else + { + writer->op_cast(tile_res, const_lowest_val_fp, ckw::ConvertPolicy::None); + } + } + else + { + writer->op_cast(tile_res, const_0_fp, ckw::ConvertPolicy::None); + } + + // tile_idx_in_w = tile_mout0 * STRIDE_X - PAD_X + auto tile_src_coord_x_start = writer->declare_tile("idx_in_w", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32); + writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Sub, tile_src_coord_x_start, const_pad_x_i32); + + // tile_idx_in_h = tile_mout1 * STRIDE_Y - PAD_Y + auto tile_src_coord_y_start = writer->declare_tile("idx_in_h", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32); + writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Sub, tile_src_coord_y_start, const_pad_y_i32); + + auto tile_neg_src_coord_x_start = writer->declare_tile("neg_src_coord_x_start", ckw::DataType::Int32); + auto tile_neg_src_coord_y_start = writer->declare_tile("neg_src_coord_y_start", ckw::DataType::Int32); + + writer->op_binary(tile_neg_src_coord_x_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_x_start); + writer->op_binary(tile_neg_src_coord_y_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_y_start); + + // int pool_x_s = max((int)0, -idx_in_w); + // int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + // int pool_y_s = max((int)0, -idx_in_h); + // int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + auto tile_pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32); + auto tile_pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32); + auto tile_pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32); + auto tile_pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32); + + writer->op_binary(tile_pool_x_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_x_start); + writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Add, const_src_w_i32, tile_neg_src_coord_x_start); + writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Min, const_pool_sz_x_i32, tile_pool_x_e); + writer->op_binary(tile_pool_y_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_y_start); + writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Add, const_src_h_i32, tile_neg_src_coord_y_start); + writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Min, const_pool_sz_y_i32, tile_pool_y_e); + + // #if defined(EXCLUDE_PADDING) + // int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); + // #else // defined(EXCLUDE_PADDING) + // int filter_size = POOL_SIZE_X * POOL_SIZE_Y; + // #endif // defined(EXCLUDE_PADDING) + auto tile_filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32); + if (_attributes.exclude_padding()) + { + auto tile_x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32); + auto tile_y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32); + + writer->op_binary(tile_x_diff, ckw::BinaryOp::Sub, tile_pool_x_e, tile_pool_x_s); + writer->op_binary(tile_y_diff, ckw::BinaryOp::Sub, tile_pool_y_e, tile_pool_y_s); + writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, tile_x_diff, tile_y_diff); + } + else + { + writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, const_pool_sz_x_i32, const_pool_sz_y_i32); + } + + auto tile_x = writer->declare_tile("x", ckw::DataType::Int32); + auto tile_y = writer->declare_tile("y", ckw::DataType::Int32); + + if (is_global_pooling) + { + writer->op_assign(tile_y, const_0_i32); + writer->op_assign(tile_pool_y_e, const_pool_sz_y_i32); + } + else + { + writer->op_assign(tile_y, tile_pool_y_s); + } + + // Y dim for-loop + writer->op_for_loop( + tile_y, ckw::BinaryOp::Less, tile_pool_y_e, tile_y, ckw::AssignmentOp::Increment, const_pos_1_i32, + [&]() + { + // Reset the iterator for the inner loop + if (is_global_pooling) + { + writer->op_assign(tile_x, const_0_i32); + writer->op_assign(tile_pool_x_e, const_pool_sz_x_i32); + } + else + { + writer->op_assign(tile_x, tile_pool_x_s); + } + + auto tile_src_coord_y = writer->declare_tile("src_coord_y", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_y, ckw::BinaryOp::Add, tile_src_coord_y_start, tile_y); + + // X dim for-loop + writer->op_for_loop( + tile_x, ckw::BinaryOp::Less, tile_pool_x_e, tile_x, ckw::AssignmentOp::Increment, const_pos_1_i32, + [&]() + { + auto tile_src_coord_x = writer->declare_tile("src_coord_x", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_x, ckw::BinaryOp::Add, tile_src_coord_x_start, tile_x); + + ckw::DataType src_dt = to_ckw(_src->data_type()); + auto tile_src = writer->declare_tile("tile_src", ckw::TileInfo(acc_dt, dst_m0, dst_n0)); + + // Load src tile + if (is_wider_acc) + { + auto tile_src0 = writer->declare_tile("src_tile0", ckw::TileInfo(src_dt, dst_m0, dst_n0)); + writer->op_load(tile_src0, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x, + tile_src_coord_y, tile_bout0); + writer->op_cast(tile_src, tile_src0, ckw::ConvertPolicy::None); + } + else + { + writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x, + tile_src_coord_y, tile_bout0); + } + + // Take the square of the input, for L2 Pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_binary(tile_src, ckw::BinaryOp::Mul, tile_src, tile_src); + } + + // Perfom Pooling op + if (_attributes.pool_type() == PoolingType::MAX) + { + writer->op_binary(tile_res, ckw::BinaryOp::Max, tile_res, tile_src); + } + else + { + writer->op_binary(tile_res, ckw::BinaryOp::Add, tile_res, tile_src); + } + }); + }); + + if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) + { + // Filter_size is automatically broadcasted in the operation + auto tile_filter_size_fp = writer->declare_tile("filter_size_fp", ckw::TileInfo(acc_dt)); + writer->op_cast(tile_filter_size_fp, tile_filter_size, ckw::ConvertPolicy::None); + writer->op_binary(tile_res, ckw::BinaryOp::Div, tile_res, tile_filter_size_fp); + } + + // Take square root of the result in L2 pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_unary(tile_res, ckw::UnaryOp::Sqrt, tile_res); + } + + // Store the results and do casting if mixed precision + if (is_wider_acc) + { + writer->op_cast(tile_dst, tile_res, ckw::ConvertPolicy::None); + } + else + { + writer->op_assign(tile_dst, tile_res); + } +} + +Window GpuCkwPool2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + const uint32_t vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + // Create and configure kernel window + auto win = calculate_max_window(output_shape, Steps(vec_size)); + win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size. + return win; +} + +std::string GpuCkwPool2d::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + return "pool2dMxN"; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h new file mode 100644 index 0000000000..822282a108 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" + +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwPool2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentPool2d::Attributes; + using Settings = ClComponentPool2d::Settings; + + /** Constructor + * + * For supported configurations please refer to @ref ClComponentCast::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + GpuCkwPool2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwPool2d); + /** Destructor */ + ~GpuCkwPool2d() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp new file mode 100644 index 0000000000..edd7ea9a38 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "support/StringSupport.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +constexpr uint32_t opencl_vector_size_in_bytes = 16; +} // namespace + +GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const uint32_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + + // CKW constants + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32)); + auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32)); + auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + + // dst_m0 must be 1 + ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the samplers for the input tensor + ********************************************************************************/ + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code + ********************************************************************************/ + + // .... + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32, + const_0_i32); + writer->op_assign(tile_xo, tile_gid_1); + writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); + auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); + + switch (_attributes.sampling_policy()) + { + case SamplingPolicy::TOP_LEFT: + // xi_f = (xo * scale_x) + // yi_f = (yo * scale_y) + writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp); + break; + case SamplingPolicy::CENTER: + { + // xi_f = ((xo + 0.5f) * scale_x) + // yi_f = ((yo + 0.5f) * scale_y) + const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32); + const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32); + + writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported sampling policy"); + } + + if (_attributes.align_corners()) + { + writer->op_unary(tile_xi_f, ckw::UnaryOp::Round, tile_xi_f); + writer->op_unary(tile_yi_f, ckw::UnaryOp::Round, tile_yi_f); + } + + // xi0 = clamp((int)xi_f, 0, (int)src_w - 1) + // yi0 = clamp((int)yi_f, 0, (int)src_h - 1) + auto tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32); + auto tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32); + + writer->op_cast(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None); + + auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32); + auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32); + + writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32); + writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32); + + auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); + auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); + + writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi_f_int, const_0_i32, tile_src_w_minus_1); + writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi_f_int, const_0_i32, tile_src_h_minus_1); + + auto tile_src = writer->declare_tile("src_tile", ckw::TileInfo(dst_dt, 1, dst_n0)); + writer->op_load(tile_src, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo); + + writer->op_assign(tile_dst, tile_src); +} + +void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); + const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + + // CKW constants + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, ckw::DataType::Fp32)); + auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32)); + auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32)); + auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + + // dst_m0 must be 1 + ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code + ********************************************************************************/ + + // .... + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32, + const_0_i32); + writer->op_assign(tile_xo, tile_gid_1); + writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); + auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); + + switch (_attributes.sampling_policy()) + { + case SamplingPolicy::TOP_LEFT: + // xi_f = (xo * scale_x) + // yi_f = (yo * scale_y) + writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp); + break; + case SamplingPolicy::CENTER: + { + // xi_f = ((xo + 0.5f) * scale_x - 0.5f) + // yi_f = ((yo + 0.5f) * scale_y - 0.5f) + const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32); + const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32); + + writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp); + + writer->op_binary(tile_xi_f, ckw::BinaryOp::Sub, tile_xi_f, const_pos_0_5_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Sub, tile_yi_f, const_pos_0_5_fp); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported sampling policy"); + } + + // xi = (int)floor(xi_f); + // yi = (int)floor(yi_f); + auto tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32); + auto tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32); + writer->op_unary(tile_xi_f_floor, ckw::UnaryOp::Floor, tile_xi_f); + writer->op_unary(tile_yi_f_floor, ckw::UnaryOp::Floor, tile_yi_f); + + auto tile_xi = writer->declare_tile("xi", ckw::DataType::Int32); + auto tile_yi = writer->declare_tile("yi", ckw::DataType::Int32); + writer->op_cast(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None); + + // xi0 = clamp(xi, 0, (int)src_w - 1); + // yi0 = clamp(yi, 0, (int)src_h - 1); + // xi1 = clamp(xi + 1, 0, (int)src_w - 1); + // yi1 = clamp(yi + 1, 0, (int)src_h - 1); + auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32); + auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32); + writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32); + writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32); + + auto tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32); + auto tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32); + writer->op_binary(tile_xi_plus_1, ckw::BinaryOp::Add, tile_xi, const_pos_1_i32); + writer->op_binary(tile_yi_plus_1, ckw::BinaryOp::Add, tile_yi, const_pos_1_i32); + + auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); + auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); + auto tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32); + auto tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32); + + writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi, const_0_i32, tile_src_w_minus_1); + writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi, const_0_i32, tile_src_h_minus_1); + writer->op_ternary(tile_xi1, ckw::TernaryOp::Clamp, tile_xi_plus_1, const_0_i32, tile_src_w_minus_1); + writer->op_ternary(tile_yi1, ckw::TernaryOp::Clamp, tile_yi_plus_1, const_0_i32, tile_src_h_minus_1); + + auto tile_in00 = writer->declare_tile("in00", ckw::TileInfo(dst_dt, 1, dst_n0)); + auto tile_in01 = writer->declare_tile("in01", ckw::TileInfo(dst_dt, 1, dst_n0)); + auto tile_in10 = writer->declare_tile("in10", ckw::TileInfo(dst_dt, 1, dst_n0)); + auto tile_in11 = writer->declare_tile("in11", ckw::TileInfo(dst_dt, 1, dst_n0)); + + writer->op_load(tile_in00, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo); + writer->op_load(tile_in01, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi0, tile_bo); + writer->op_load(tile_in10, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi1, tile_bo); + writer->op_load(tile_in11, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi1, tile_bo); + + // Weights of each nearest pixel + auto tile_a = writer->declare_tile("a", ckw::DataType::Fp32); + auto tile_b = writer->declare_tile("b", ckw::DataType::Fp32); + auto tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32); + auto tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32); + + // a = (xi_f - (float)xi) + // b = (1.f - a) + // a1 = (yi_f - (float)yi) + // b1 = (1.f - a1) + auto tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32); + auto tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32); + writer->op_cast(tile_xi_float, tile_xi, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_float, tile_yi, ckw::ConvertPolicy::None); + + writer->op_binary(tile_a, ckw::BinaryOp::Sub, tile_xi_f, tile_xi_float); + writer->op_binary(tile_b, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a); + writer->op_binary(tile_a1, ckw::BinaryOp::Sub, tile_yi_f, tile_yi_float); + writer->op_binary(tile_b1, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a1); + + // Cast weights to source type + const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type())); + const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type())); + const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type())); + const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type())); + + writer->op_cast(tile_a_src_type, tile_a, ckw::ConvertPolicy::None); + writer->op_cast(tile_b_src_type, tile_b, ckw::ConvertPolicy::None); + writer->op_cast(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None); + writer->op_cast(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None); + + // in00 * b * b1 + writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b_src_type); + writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b1_src_type); + + // in01 * a * b1 + writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_a_src_type); + writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_b1_src_type); + + // in10 * b * a1 + writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_b_src_type); + writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_a1_src_type); + + // in11 * a * a1 + writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a_src_type); + writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a1_src_type); + + // Summation of above terms + writer->op_assign(tile_dst, tile_in00); + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in01); + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in10); + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in11); +} + +void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + switch (_attributes.interpolation_policy()) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + do_nearest_neighbor_resize(comp_group, vtable, writer); + break; + case InterpolationPolicy::BILINEAR: + do_bilinear_resize(comp_group, vtable, writer); + break; + default: + ARM_COMPUTE_ERROR("Unsupported interpolation policy"); + } +} + +Window GpuCkwResize::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const uint32_t n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0)); + Window win = calculate_max_window(*_dst, Steps(n0)); + return win.collapse(win, Window::DimZ); +} + +std::string GpuCkwResize::get_tuner_id(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string tuner_id = "resize_"; + tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : ""; + tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : ""; + tuner_id += "_"; + tuner_id += _attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft"; + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(0)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(1)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(2)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(3)); + + return tuner_id; +} + +std::string GpuCkwResize::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string name = "resize_"; + name += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : ""; + name += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : ""; + + return name; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h new file mode 100644 index 0000000000..1266c05921 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwResize final : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentResize::Attributes; + +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + */ + GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwResize); + + /** Destructor */ + ~GpuCkwResize() override = default; + + // Inherited methods overriden + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + std::string get_tuner_id(const ComponentGroup &comp_group) const override; + +private: + /** Resize using nearest neighbor interpolation + * + * @param[in] comp_group Component group to which this component belongs to + * @param[in, out] vtable Table of variables declared by this component + * @param[in, out] writer CKW writer that writes code scoped to this kernel component + */ + void do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const; + + /** Resize using bilinear interpolation + * + * @param[in] comp_group Component group to which this component belongs to + * @param[in, out] vtable Table of variables declared by this component + * @param[in, out] writer CKW writer that writes code scoped to this kernel component + */ + void do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const; + + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp new file mode 100644 index 0000000000..d9d741fea5 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "GpuCkwStore.h" + +#include "arm_compute/core/Error.h" + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" + +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); +} +void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_h = static_cast<int32_t>(_dst->dimension(2)); + + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + const auto &tile_src = src->tile(); + auto &sampler_src = src->tensor_sampler(); + + const auto dst_n0 = static_cast<int32_t>(tile_src.tile_info().width()); + const auto dst_m0 = static_cast<int32_t>(tile_src.tile_info().height()); + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the samplers for the input tensor + ********************************************************************************/ + // Not required + + /******************************************************************************** + * 6 - Extra operations required before writing the main code + ********************************************************************************/ + // Not required + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_nout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_n0_i32, const_shift_back_n0_i32, + const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_m0_i32); + + // Get the boundary aware coordinates at each global dimension index + if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + // For tile_mout1 and tile_bout0 the step can only be 1 + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + } + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + writer->op_store(dst->tensor(), tile_src, sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0); +} + +std::string GpuCkwStore::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + return "store"; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h new file mode 100644 index 0000000000..c9ce7eb269 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwStore : public IGpuCkwComponentDriver +{ +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + */ + GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwStore); + /** Destructor */ + ~GpuCkwStore() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp new file mode 100644 index 0000000000..1e6f0841ad --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "CkwHelper.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step) +{ + writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step); +} + +void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step, + ckw::TileOperand &shift_back, + ckw::TileOperand &const_0) +{ + // Applied formula: max((gid * step) - shift_back, 0) + // where the shift_back operand is: (step - leftover_step) % step + + writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step); + writer->op_binary(coord, ckw::BinaryOp::Sub, coord, shift_back); + writer->op_binary(coord, ckw::BinaryOp::Max, coord, const_0); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h new file mode 100644 index 0000000000..956e7c8ecb --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Get coordinate along one axis. + * + * @param[in,out] writer Writer + * @param[out] coord Resultant coordinate + * @param[in] gid Global work item id + * @param[in] step Step size / vector size + */ +void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step); + +/** Get boundary aware coordinate along one axis. + * + * @param[in,out] writer Writer + * @param[out] coord Resultant coordinate + * @param[in] gid Global work item id + * @param[in] step Step size / vector size + * @param[in] shift_back It is (step - leftover_step) % step + * @param[in] const_0 Constant tile of value 0 + */ +void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step, + ckw::TileOperand &shift_back, + ckw::TileOperand &const_0); +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp new file mode 100644 index 0000000000..ad31b06362 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "Common.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ckw::DataType to_ckw(DataType dt) +{ + switch (dt) + { + case DataType::F32: + return ckw::DataType::Fp32; + case DataType::F16: + return ckw::DataType::Fp16; + case DataType::S32: + return ckw::DataType::Int32; + case DataType::S16: + return ckw::DataType::Int16; + case DataType::S8: + case DataType::QASYMM8_SIGNED: + return ckw::DataType::Int8; + case DataType::U32: + return ckw::DataType::Uint32; + case DataType::U16: + return ckw::DataType::Uint16; + case DataType::U8: + case DataType::QASYMM8: + return ckw::DataType::Uint8; + default: + return ckw::DataType::Unknown; + } +} + +ckw::TensorShape to_ckw(const TensorShape &shape) +{ + ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{}); + ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5); + /// NOTE: Overflow danger. Use size_t? + return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]), + static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]), + static_cast<int32_t>(shape[4])}; +} + +ckw::TensorDataLayout to_ckw(DataLayout dl) +{ + switch (dl) + { + case DataLayout::NHWC: + return ckw::TensorDataLayout::Nhwc; + case DataLayout::NDHWC: + return ckw::TensorDataLayout::Ndhwc; + default: + return ckw::TensorDataLayout::Unknown; + } +} + +ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info) +{ + return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()), + to_ckw(tensor_info.data_layout()), tensor_info.id()}; +} + +ckw::TensorStorageType to_ckw(const TensorStorageType &storage) +{ + switch (storage) + { + case TensorStorageType::ClBufferUint8Ptr: + return ckw::TensorStorageType::BufferUint8Ptr; + case TensorStorageType::ClImage2dReadOnly: + return ckw::TensorStorageType::Texture2dReadOnly; + case TensorStorageType::ClImage2dWriteOnly: + return ckw::TensorStorageType::Texture2dWriteOnly; + case TensorStorageType::Unknown: + return ckw::TensorStorageType::Unknown; + default: + ARM_COMPUTE_ERROR("Unknown tensor storage type"); + } +} + +TensorComponentType from_ckw(const ckw::TensorComponentType &component) +{ + switch (component) + { + case ckw::TensorComponentType::OffsetFirstElement: + return TensorComponentType::OffsetFirstElement; + case ckw::TensorComponentType::Stride0: + return TensorComponentType::Stride0; + case ckw::TensorComponentType::Stride1: + return TensorComponentType::Stride1; + case ckw::TensorComponentType::Stride2: + return TensorComponentType::Stride2; + case ckw::TensorComponentType::Stride3: + return TensorComponentType::Stride3; + case ckw::TensorComponentType::Stride4: + return TensorComponentType::Stride4; + case ckw::TensorComponentType::Dim0: + return TensorComponentType::Dim0; + case ckw::TensorComponentType::Dim1: + return TensorComponentType::Dim1; + case ckw::TensorComponentType::Dim2: + return TensorComponentType::Dim2; + case ckw::TensorComponentType::Dim3: + return TensorComponentType::Dim3; + case ckw::TensorComponentType::Dim4: + return TensorComponentType::Dim4; + case ckw::TensorComponentType::Dim1xDim2: + return TensorComponentType::Dim1xDim2; + case ckw::TensorComponentType::Dim2xDim3: + return TensorComponentType::Dim2xDim3; + case ckw::TensorComponentType::Dim1xDim2xDim3: + return TensorComponentType::Dim1xDim2xDim3; + case ckw::TensorComponentType::Unknown: + return TensorComponentType::Unknown; + default: + ARM_COMPUTE_ERROR("Unknown CKW tensor component"); + } +} + +TensorStorageType from_ckw(const ckw::TensorStorageType &storage) +{ + switch (storage) + { + case ckw::TensorStorageType::BufferUint8Ptr: + return TensorStorageType::ClBufferUint8Ptr; + case ckw::TensorStorageType::Texture2dReadOnly: + return TensorStorageType::ClImage2dReadOnly; + case ckw::TensorStorageType::Texture2dWriteOnly: + return TensorStorageType::ClImage2dWriteOnly; + case ckw::TensorStorageType::Unknown: + return TensorStorageType::Unknown; + default: + ARM_COMPUTE_ERROR("Unknown CKW tensor storage type"); + } +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h new file mode 100644 index 0000000000..26740cdd04 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H + +#include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + +#include "compute_kernel_writer/include/ckw/TensorInfo.h" +#include "compute_kernel_writer/include/ckw/types/DataType.h" +#include "compute_kernel_writer/include/ckw/types/TensorComponentType.h" +#include "compute_kernel_writer/include/ckw/types/TensorStorageType.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Convert the Compute Library data type to Compute Kernel Writer data type + * + * @param[in] dt The Compute Library data type + * + * @return the Compute Kernel Writer data type (ckw::DataType) + */ +ckw::DataType to_ckw(DataType dt); + +/** Convert the Compute Library tensor shape to Compute Kernel Writer tensor shape + * + * @param[in] shape The Compute Library tensor shape + * + * @return the Compute Kernel Writer tensor shape (ckw::TensorShape) + */ +ckw::TensorShape to_ckw(const TensorShape &shape); + +/** Convert the Compute Library data layout to Compute Kernel Writer data layout + * + * @param[in] dl The Compute Library data layout + * + * @return the Compute Kernel Writer data layout (ckw::TensorDataLayout) + */ +ckw::TensorDataLayout to_ckw(DataLayout dl); + +/** Convert the Compute Library tensor info to Compute Kernel Writer tensor info + * + * @param[in] tensor_info The Compute Library tensor info + * + * @return the Compute Kernel Writer tensor info (ckw::TensorInfo) + */ +ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info); + +/** Convert the Compute Library tensor storage to Compute Kernel Writer tensor storage + * + * @param[in] storage The Compute Library tensor storage + * + * @return the Compute Kernel Writer tensor storate (ckw::TensorStorageType) + */ +ckw::TensorStorageType to_ckw(const TensorStorageType &storage); + +/** Convert the Compute Kernel Writer tensor component to Compute Library tensor component + * + * @param[in] component The Compute Kernel Writer tensor component + * + * @return the Compute Library tensor component + */ +TensorComponentType from_ckw(const ckw::TensorComponentType &component); + +/** Convert the Compute Kernel Writer tensor storage to Compute Library tensor storage + * + * @param[in] storage The Compute Kernel Writer tensor storage + * + * @return the Compute Library tensor storage + */ +TensorStorageType from_ckw(const ckw::TensorStorageType &storage); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp new file mode 100644 index 0000000000..5630e390d5 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" + +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes) +{ + switch (attributes.operation()) + { + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add: + return ckw::BinaryOp::Add; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub: + return ckw::BinaryOp::Sub; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div: + return ckw::BinaryOp::Div; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul: + return ckw::BinaryOp::Mul; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff: + default: + ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp"); + } +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h new file mode 100644 index 0000000000..644a407702 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H + +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +#include "compute_kernel_writer/include/ckw/types/Operators.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes); +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H |