diff options
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/template_writer/cl')
6 files changed, 1100 insertions, 0 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp new file mode 100644 index 0000000000..870de64eb8 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClTemplateDirectConv2d.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuTemplateComponentWriter{ id, tensors }, + _src{}, + _weight{}, + _bias{}, + _dst{}, + _attributes{ attributes }, + _settings{ settings } +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst); +} + +std::string ClTemplateDirectConv2d::get_name() const +{ + return "direct_conv2d"; +} + +std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); + const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 
16u : 8u, _src->dimension(channel_idx)); + const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0; + + std::string code = R"_( +//------------------ START KERNEL {{meta_kernel_id}} --------------------- +// IN_0(src) {{src}} +// IN_1(wei) {{weight}} +)_"; + if(_bias && _bias->has_valid_id()) + { + code += R"_( +// IN_1(bia) {{bias}} +)_"; + } + code += R"_( +// OUT(dst, accum) {{dst}} + +// Initialize the accumulators +TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); +{ + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. +#define _IWEI_WIDTH {{WEI_WIDTH}} +#define _IWEI_HEIGHT {{WEI_HEIGHT}} +#define _ISRC_WIDTH {{src}}_w +#define _ISRC_HEIGHT {{src}}_h +#define _ISRC_CHANNELS {{src}}_c +#define _IDST_WIDTH {{arg_dst}}_w +#define _IDST_HEIGHT {{arg_dst}}_h +#define _IDST_CHANNELS {{arg_dst}}_c +#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) + + // .v = access the whole vector (OpenCL vector) + // .s[x] = access the vector element at position x (scalar access) + TILE(int, M0, 1, xi); + TILE(int, M0, 1, yi); + + // Convert the linear index to coordinate + LOOP_UNROLLING(int, i, 0, 1, M0, + { + xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}}; + yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}}; + xi[i].v -= {{PAD_LEFT}}; + yi[i].v -= {{PAD_TOP}}; + }) + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + {{dst}}[i].v = 0; + }) + + for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) + { + int ck = 0; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_WIDTH; + + int k = 0; + for(; k <= (_ISRC_CHANNELS - K0); k += K0) + { + TILE({{SRC_DATA_TYPE}}, M0, K0, a); + TILE({{WEI_DATA_TYPE}}, N0, K0, b); + + // Initialize tiles + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = {{ZERO_VALUE}}; + }) + + LOOP_UNROLLING(int, i, 0, 1, N0, + { + b[i].v = {{ZERO_VALUE}}; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, 
{{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); + + // Load tile from the weights tensor + T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}}); + + ck += K0; + } + + // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS + // This #if directive should be removed in case of dynamic tensor support +)_"; + + if(leftover_loop) + { + code += R"_( + // Left-over accumulations + for(; k < _ISRC_CHANNELS; ++k) + { + TILE({{SRC_DATA_TYPE}}, M0, 1, a); + TILE({{WEI_DATA_TYPE}}, N0, 1, b); + + // Initialize tiles + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = {{ZERO_VALUE}}; + }) + + LOOP_UNROLLING(int, i, 0, 1, N0, + { + b[i].v = {{ZERO_VALUE}}; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); + + // Load tile from the weights tensor + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); + + ++ck; + } + )_"; +} + +code += R"_( +#undef _IWEI_WIDTH +#undef _IWEI_HEIGHT +#undef _ISRC_WIDTH +#undef _ISRC_HEIGHT +#undef _ISRC_CHANNELS +#undef _IDST_WIDTH +#undef _IDST_HEIGHT +#undef _IDST_CHANNELS +#undef _IY_MULTIPLIER + + } +)_"; + + if(_bias && _bias->has_valid_id()) + { + code += R"_( + TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); + + T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, 
g_ind_0, 0, 1, 0, bias0); + + // c = c + bias[broadcasted] + T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); + )_"; +} + +code += R"_( +} +//------------------ END KERNEL {{meta_kernel_id}} --------------------- +)_"; + return code; +} + +void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + vtable.declare_variable( + _src, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_src), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + vtable.declare_variable( + _weight, + GpuKernelArgumentInfo(weight_type), + comp_group.is_intermediate_tensor(_weight), + "weight"); + + if(_bias && _bias->has_valid_id()) // optional bias + { + vtable.declare_variable( + _bias, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), + comp_group.is_intermediate_tensor(_bias), + "bias"); + } + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + // Arguments and global shared variables + lut["src"] = vtable.get_variable(_src); + lut["weight"] = vtable.get_variable(_weight); + + if(_bias && _bias->has_valid_id()) // optional bias + { + lut["bias"] = vtable.get_variable(_bias); + lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); + } + lut["dst"] = vtable.get_variable(_dst); + + const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + lut["arg_dst"] = dst_argument.uniq_name; + + // Local build options + lut["meta_kernel_id"] = id(); + lut["ACC_DATA_TYPE"] = _src->data_type(); + 
lut["SRC_DATA_TYPE"] = _src->data_type(); + lut["WEI_DATA_TYPE"] = _weight->data_type(); + + lut["SRC_TENSOR_TYPE"] = "BUFFER"; + switch(vtable.get_variable(_weight).kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } + default: + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } + } + const auto width_idx = 1; + const auto height_idx = 2; + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + + lut["STRIDE_X"] = _attributes.stride().x(); + lut["STRIDE_Y"] = _attributes.stride().y(); + + lut["PAD_LEFT"] = _attributes.pad().left; + lut["PAD_TOP"] = _attributes.pad().top; + + lut["ZERO_VALUE"] = 0; + + return lut; +} + +CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const +{ + const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); + const DataType data_type = _src->data_type(); + + /// NOTE: For now tile sizes (n0, m0, n0) are set by the execution window. This may change in the future + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, _src->dimension(channel_idx)); + const unsigned int partial_store_n0 = _dst->dimension(0) % n0; + + CLBuildOptions build_opts{}; + if(_settings.fast_relaxed_math()) + { + build_opts.add_option("-cl-fast-relaxed-math"); + } + else + { + // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations + // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations + build_opts.add_option("-cl-unsafe-math-optimizations"); + } + build_opts.add_option("-DIS_TILED"); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +std::string ClTemplateDirectConv2d::get_config_id() const +{ + const DataType data_type = _src->data_type(); + const DataLayout data_layout = _src->data_layout(); + + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + const unsigned int kernel_size = _weight->dimension(width_idx); + + std::string config_id{}; + config_id += lower_string(string_from_data_type(data_type)); + config_id += "_"; + config_id += support::cpp11::to_string(kernel_size); + config_id += "_"; + config_id += support::cpp11::to_string(_attributes.stride().x()); + config_id += "_"; + config_id += support::cpp11::to_string(_attributes.stride().y()); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(width_idx)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(height_idx)); + config_id += "_"; + config_id += lower_string(string_from_data_layout(data_layout)); + return config_id; +} + +std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const +{ + 
return std::set<std::string>{ "helpers.h", "tile_helpers.h" }; +} + +Window ClTemplateDirectConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const auto output_shape = _dst->tensor_shape(); + + const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u); + const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U; + + // Create and configure kernel window + Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + + const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows); + win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows)); + win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h new file mode 100644 index 0000000000..48027a9b8d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter +{ +public: + using Attributes = ClComponentDirectConv2d::Attributes; + using Settings = ClComponentDirectConv2d::Settings; + /** Constructor + * + * Similar to @ref ClComponentDirectConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + ClTemplateDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + /** Prevent instances of this class from being copy constructed */ + ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d 
&direct_conv2d) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default; + /** Allow instances of this class to be moved */ + ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. 
+ * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the build options used in the component + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return CLBuildOptions Build options + */ + CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override; + /** Generate the component config id string used for tuning */ + std::string get_config_id() const override; + /** Generate the header list used in the component */ + std::set<std::string> get_headers_list() const override; + /** Generate the execution window for the component */ + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_weight; + const ITensorInfo *_bias; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp new file mode 100644 index 0000000000..6c4b8f52f2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClTemplateStore.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) + : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); +} + +std::string ClTemplateStore::get_name() const +{ + return "store"; +} + +std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + return R"_( +//------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- +{ +// This also follows NHWC layout +// g_ind_0 maps to global_id(0) maps to Channel +// g_ind_1 maps to global_id(1) maps to Height and Weight (Collapsed Window) +// g_ind_2 maps to global_id(2) maps to N / Batch +#define _IDST_WIDTH {{dst}}_w +#define _IDST_HEIGHT {{dst}}_h + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y); + +#undef _IDST_WIDTH +#undef _IDST_HEIGHT + //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- +} + +)_"; +} + +void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + // ARM_COMPUTE_UNUSED(comp_group) + vtable.declare_variable( + _src, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + 
comp_group.is_intermediate_tensor(_src), + "src"); + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + + // Arguments and global shared variables + lut["src"] = vtable.get_variable(_src); + lut["dst"] = vtable.get_variable(_dst); + + // Local build options + lut["meta_kernel_id"] = id(); + lut["DST_TENSOR_TYPE"] = "BUFFER"; + const auto dst_info = comp_group.get_dst_tensors()[0]; + lut["DST_DATA_TYPE"] = dst_info->data_type(); + + return lut; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h new file mode 100644 index 0000000000..3f97a82204 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE + +#include "arm_compute/core/experimental/Types.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateStore final : public IGpuTemplateComponentWriter +{ +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + */ + ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors); + /** Prevent instances of this class from being copy constructed */ + ClTemplateStore(const ClTemplateStore &store) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateStore &operator=(const ClTemplateStore &store) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateStore(ClTemplateStore &&store) = default; + /** Allow instances of this class to be moved */ + ClTemplateStore &operator=(ClTemplateStore &&store) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the 
component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. + * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp new file mode 100644 index 0000000000..cb643a741d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClTemplateWriter.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/// @note: some tags can be unused since they could be used only for the macros, or only for the component code +std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags) +{ + std::string replaced_code = ""; + bool scanning_pattern = false; + std::string pattern_found = ""; + for(size_t i = 0; i < code_template.size() - 1; ++i) + { + if(!scanning_pattern) + { + if(code_template[i] == '{' && code_template[i + 1] == '{') + { + i += 1; + scanning_pattern = true; + pattern_found = ""; + } + else + { + replaced_code += code_template[i]; + } + } + else + { + if(code_template[i] == '}' && code_template[i + 1] == '}') + { + i += 1; + scanning_pattern = false; + std::string err = "Pattern " + pattern_found + " not found in tags"; + ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str()); + replaced_code += tags.find(pattern_found)->second.value; + } + else + { + pattern_found += code_template[i]; + } + } + } + + return replaced_code; +} +ClTemplateWriter::~ClTemplateWriter() +{ +} +ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) + : _components{ components } +{ +} +std::string ClTemplateWriter::get_name() +{ + return write_kernel_name(); +} +std::string ClTemplateWriter::get_code() +{ + return write_code(); +} +std::string ClTemplateWriter::get_config_id() +{ + std::string config_id = 
get_name(); + for(const auto &comp : _components) + { + config_id += "--" + comp->template_writer()->get_config_id() + "--"; + } + + return config_id; +} + +CLBuildOptions ClTemplateWriter::get_build_options() +{ + CLBuildOptions build_opts{}; + + for(const auto &comp : _components) + { + build_opts.add_options(comp->template_writer()->get_build_options(_components).options()); + } + + return build_opts; +} + +Window ClTemplateWriter::get_window() const +{ + const auto root_comp = _components.get_root_component(); + ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found"); + return root_comp->template_writer()->get_window(); +} + +std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors() +{ + // Assemble GpuKernelArguments + std::map<ITensorInfo::Id, GpuKernelArgument> tensors; + for(const auto t : _components.get_argument_tensors()) + { + tensors.emplace( + t->id(), + GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info }); + } + return tensors; +} + +std::string ClTemplateWriter::write_code() +{ + ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found"); + + // These data structures will hold the data from all the components in the blueprint + std::set<std::string> headers_list{}; + std::set<std::string> additional_macros{}; + std::vector<std::string> component_codes{}; // vector because order matters + + // Pass 1: Declare all kernel variables + for(auto &component : _components) + { + component->template_writer()->declare_variables(_vtable, _components); + } + // Pass 2: Generate component codes + for(auto &component : _components) + { + const auto component_writer = component->template_writer(); + auto curr_headers_list = component_writer->get_headers_list(); + auto curr_additional_macros = component_writer->get_additional_macros(); + auto curr_component_code = component_writer->get_component_code(_components); + const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally 
can be merged with get_component_code once we have finer-grained code generation technique + component_codes.push_back(replace_tags(curr_component_code, var_lut)); + + headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); + if(!additional_macros.empty()) // Some components might not have any + { + additional_macros.insert(replace_tags(curr_additional_macros, var_lut)); + } + } + + // Step 3: Assemble the data gathered by traversing the graph into the string "code" + std::string code = ""; + + for(auto &header : headers_list) + { +#if defined(EMBEDDED_KERNELS) + code += CLKernelLibrary::get().get_program(header).first; +#else // defined(EMBEDDED_KERNELS) + code += "#include \"" + header + "\"\n"; +#endif // defined(EMBEDDED_KERNELS) + } + + for(auto ¯os : additional_macros) + { + code += macros; + } + + code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors())); + + code += "\n{\n\n"; + + code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n"; + code += write_global_section(); + code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n"; + + for(const auto &component_code : component_codes) + { + code += component_code; + } + + code += "}\n"; + + return code; +} +std::string ClTemplateWriter::write_global_section() const +{ + const auto dst_tensors = _components.get_dst_tensors(); + ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed"); + const auto dst_info = dst_tensors[0]; + const auto dst_w = dst_info->dimension(0); + const auto tile_w = std::max(1, get_window().x().step()); + const auto tile_h = std::max(1, get_window().y().step()); + auto leftover_w = dst_w % tile_w; + + std::string code = ""; + code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; + code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + 
std::to_string(tile_h) + ", " + "0);\n"; + code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n"); + + code += " const bool g_cond_x = (g_ind_0 == 0);\n"; + code += " const bool g_cond_y = (g_ind_1 == 0);\n"; + + return code; +} +std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const +{ + std::string code; + switch(var.kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Vector: + { + code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")"; + break; + } + case GpuKernelArgumentInfo::Type::Image: + { + code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")"; + break; + } + case GpuKernelArgumentInfo::Type::Image_3D: + { + code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),"; + code += "\n unsigned int " + var.uniq_name + "_stride_z"; + break; + } + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + { + code += "\n __read_only image2d_t " + var.uniq_name + "_img,"; + code += "\n unsigned int " + var.uniq_name + "_stride_z"; + break; + } + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer: + { + code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)"; + break; + } + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + { + code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)"; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type"); + } + } + return code; +} +std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const +{ + std::string code = "\n__kernel void " + write_kernel_name() + "("; + + for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i) + { + code += write_argument_declaration(argument_list[i]) + ","; + } + if(static_cast<int>(argument_list.size()) - 1 >= 0) + { + code += write_argument_declaration(argument_list[argument_list.size() - 1]); + } + + code += ')'; + + return code; +} +std::string 
ClTemplateWriter::write_kernel_name() const +{ + if(_components.empty()) + { + return "empty_kernel"; + } + std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name(); + for(size_t i = 1; i < _components.size(); ++i) + { + name += "___"; + name += _components[i]->template_writer()->get_name(); + } + + return name; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h new file mode 100644 index 0000000000..83f617b6c6 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER

#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"

#include <map>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Use a templated-string-based method to write kernel code
 * It stitches the component code templates together based on the valid fusion configuration.
 * It then instantiates the actual kernel code from the template and the generated tag lookup table.
 */
class ClTemplateWriter : public IGpuKernelWriter
{
public:
    /** Instantiates a kernel code string from the kernel code template
     * @note: some tags can be unused since they could be used only for the macros, or only for the component code
     *
     * @param[in] code_template Kernel code template
     * @param[in] tags          Tag lookup table
     *
     * @return std::string Instantiated kernel string
     */
    static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
    /** Default constructor */
    ClTemplateWriter() = default;
    /** Constructor
     *
     * @param[in] components Kernel component group from which the kernel will be generated
     */
    ClTemplateWriter(const GpuKernelComponentGroup &components);
    /** Destructor */
    ~ClTemplateWriter() override;
    /** Generate kernel name */
    std::string get_name() override;
    /** Generate kernel code */
    std::string get_code() override;
    /** Generate build options */
    CLBuildOptions get_build_options() override;
    /** Generate config id string of the entire kernel. This is used for tuning */
    std::string get_config_id() override;
    /** Generate execution window */
    Window get_window() const override;
    /** Get the kernel argument lists of the kernel*/
    std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;

private:
    // Name of the fused kernel (component names joined together)
    std::string write_kernel_name() const;
    // Full kernel source: headers, macros, signature, global section and component codes
    std::string write_code();
    // Shared preamble emitted at the top of every kernel body (global indices / conditions)
    std::string write_global_section() const;
    // One kernel-parameter declaration for a single tensor variable
    std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
    // __kernel signature built from the full argument list
    std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;

private:
    GpuKernelComponentGroup _components{}; // Components to be fused into one kernel
    GpuKernelVariableTable  _vtable{};     // Variable table populated while generating the kernel
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */