Diffstat (limited to 'src')
46 files changed, 6272 insertions, 4 deletions
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index fce8798b48..ea03d59fc2 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -232,7 +232,7 @@ void CLCompileContext::set_context(cl::Context context) std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const { std::string concat_str; - bool ext_supported = false; + bool ext_supported = false; std::string ext_buildopts; #if defined(ARM_COMPUTE_DEBUG_ENABLED) @@ -399,4 +399,8 @@ int32_t CLCompileContext::get_ddk_version() const return -1; } +GPUTarget CLCompileContext::get_gpu_target() const +{ + return _device.target(); +} } // namespace arm_compute diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index d52b105507..224b68af70 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -44,7 +44,6 @@ namespace experimental { namespace dynamic_fusion { -struct TensorBinding; struct ClExecutionDescriptor; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp index e441ddb3a2..12f79444c6 100644 --- a/src/core/TensorInfo.cpp +++ b/src/core/TensorInfo.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,7 +35,7 @@ namespace arm_compute { TensorInfo::TensorInfo() : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true }, - _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true) + _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id) { } @@ -56,8 +56,28 @@ TensorInfo::TensorInfo(const ITensorInfo &info) _quantization_info = info.quantization_info(); _data_layout = info.data_layout(); _are_values_constant = info.are_values_constant(); + _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied } +TensorInfo::TensorInfo(const TensorInfo &info) + : TensorInfo() +{ + _total_size = info.total_size(); + _offset_first_element_in_bytes = info.offset_first_element_in_bytes(); + _strides_in_bytes = info.strides_in_bytes(); + _num_channels = info.num_channels(); + _tensor_shape = info.tensor_shape(); + _dims_state = info.tensor_dims_state(); + _data_type = info.data_type(); + _format = info.format(); + _is_resizable = info.is_resizable(); + _valid_region = info.valid_region(); + _padding = info.padding(); + _quantization_info = info.quantization_info(); + _data_layout = info.data_layout(); + _are_values_constant = info.are_values_constant(); + _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied +} TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format) { diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp new file mode 100644 index 0000000000..93fbdfed63 --- /dev/null +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2022 Arm Limited. 
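// --- Editorial example (not part of the patch) ------------------------------
// A minimal sketch of the TensorInfo id semantics introduced above: copying a
// TensorInfo copies everything except the id, which resets to invalid_tensor_id
// and has to be assigned explicitly again. make_conv_src_info() is a
// hypothetical helper standing in for whatever code produced an info with a
// valid id (e.g. a workload context).
TensorInfo a = make_conv_src_info();    // hypothetical: an info whose id was explicitly set
TensorInfo b{ a };                      // copy constructor: shape, type, strides etc. are copied
ARM_COMPUTE_ERROR_ON(b.has_valid_id()); // ...but b starts over with invalid_tensor_id
// -----------------------------------------------------------------------------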
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClKernelRuntime.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/CLUtils.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" +#include "src/gpu/cl/ClKernelLibrary.h" + +#include "support/Cast.h" +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +using namespace arm_compute::opencl; + +void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code) +{ + // Create kernel from kernel source string + opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); + _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(code.name(), + "" /* Program name: Used as part of a unique string for the built kernel cache.
Not needed */, + code.code(), + klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, + code.build_options().options(), + false /* Is source binary */)); + + // Configure execution window + IClKernel::configure_internal(code.window()); + + // Set config id for lws tuning + _config_id = code.config_id(); + + // Set kernel arguments + _arguments = code.arguments(); +} + +inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images) +{ + switch(arg.type) + { + case GpuKernelArgumentInfo::Type::Scalar: + { + ARM_COMPUTE_ERROR("Unsupported yet"); + break; + } + + case GpuKernelArgumentInfo::Type::Vector: + { + add_1D_tensor_argument(idx, tensor, arg_slice); + break; + } + + case GpuKernelArgumentInfo::Type::Image: + { + add_2D_tensor_argument(idx, tensor, arg_slice); + break; + } + case GpuKernelArgumentInfo::Type::Image_Reinterpret_As_3D: + { + add_2D_tensor_argument(idx, tensor, arg_slice); + const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad)); + break; + } + case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: + { + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; + cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); + cl_images.push_back(tensor_image2d); + _kernel.setArg(idx++, tensor_image2d); + break; + } + + case GpuKernelArgumentInfo::Type::Image_3D: + { + add_2D_tensor_argument(idx, tensor, arg_slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2])); + break; + } + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + { + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; + cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); + cl_images.push_back(tensor_image2d); + _kernel.setArg(idx++, tensor_image2d); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2])); + break; + } + + case GpuKernelArgumentInfo::Type::Tensor_3D: + { + add_3D_tensor_argument(idx, tensor, arg_slice); + break; + } + + case GpuKernelArgumentInfo::Type::Tensor_4D: + { + add_4D_tensor_argument(idx, tensor, arg_slice); + break; + } + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer: + { + add_4d_tensor_nhwc_argument(idx, tensor); + break; + } + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + { + const size_t image_w = tensor->info()->dimension(0) / 4; + const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1); + const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; + + cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), + TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y); + cl_images.push_back(tensor_image2d); + + 
_kernel.setArg(idx++, tensor_image2d); + add_4d_tensor_nhwc_argument(idx, tensor); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported"); + } + } +} + +void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + // Don't slice matrix along the z dimension if matrix has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + Window slice_fixed_z = slice; + slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1)); + + /// NOTE: Parameters extracted from old kernels. So far they seem to be constant + /// but we may need to make them into another configuration passed from GpuWorkloadSourceCode if needed in the future + constexpr bool slide_along_dimz = true; + constexpr bool skip_sliding_window = false; + constexpr bool use_dummy_work_items = false; + + do + { + // Set kernel arguments, resetting the argument index for each slice + unsigned int idx = 0; + Window arg_slice = slice; + // CLImages created from tensor arguments. Need to be retained until enqueue + std::vector<cl::Image2D> cl_images; + for(auto id_arg : _arguments) + { + const auto arg = id_arg.second; + auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first)); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info()); + if(!slide_along_dimz) + { + // The stride_z for matrix must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0); + arg_slice = slice_fixed_z; + } + add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, arg_slice, cl_images); + } + + // Dispatch kernel + enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items); + } + while(!skip_sliding_window && window.slide_window_slice_3D(slice)); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h new file mode 100644 index 0000000000..acc2380031 --- /dev/null +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME +#define SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +struct GpuKernelSourceCode; + +/** OpenCL runtime to run a single kernel */ +class ClKernelRuntime final : public opencl::IClKernel +{ +public: + /** Configure the kernel runtime + * + * @param[in] compile_ctx OpenCL compile context + * @param[in] code Kernel source code + */ + void configure(const opencl::ClCompileContext &compile_ctx, const GpuKernelSourceCode &code); + /** Run the kernel + * + * @param[in,out] tensors @ref ITensorPack object containing run-time tensor memories + * @param[in] window Execution window + * @param[in] queue OpenCL command queue + */ + virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + /** Set a kernel tensor argument + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] arg Kernel argument descriptor accompanying @p tensor + * @param[in] tensor Tensor to set as an argument of the object's kernel + * @param[in] arg_slice Window the kernel will be run on + * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) + */ + inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images); + +private: + GpuKernelArgumentList _arguments{}; /** All kernel arguments required by the runtime */ +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME */ diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp new file mode 100644 index 0000000000..549c6d4abb --- /dev/null +++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" +#include "support/Cast.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode. + * + * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo + * + * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope + */ +class ClAuxTensors +{ +public: + /** A view of a single auxiliary tensor and the associated @ref TensorInfo and @ref AuxMemoryInfo + */ + struct DataView + { + DataView() = default; + DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info) + : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info } + { + } + ~DataView() = default; + DataView(const DataView &other) = default; + DataView &operator=(const DataView &other) = default; + DataView(DataView &&other) = default; + DataView &operator=(DataView &&other) = default; + CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */ + TensorInfo tensor_info{}; /**< Associated tensor info */ + AuxMemoryInfo memory_info{}; /**< Memory requirement */ + }; + + /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */ + std::vector<DataView> get_tensors() + { + return _tensors; + } + std::vector<DataView> get_tensors() const + { + return _tensors; + } + + friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code); + +private: + /** Add auxiliary tensor.
+ * + * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor + * @param[in] aux_memory_info Memory requirements of the auxiliary tensor + * + * @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr + */ + CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info) + { + const auto t_id = tensor_info.id(); + auto find_tensor_pair = _owned_tensors.find(t_id); + if(find_tensor_pair != _owned_tensors.end()) + { + return find_tensor_pair->second.get(); + } + else + { + auto tensor = std::make_unique<CLTensor>(); + auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first; + auto new_tensor = inserted_pair->second.get(); + _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info); + return new_tensor; + } + } + + std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{}; + std::vector<DataView> _tensors{}; +}; +/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode + * + * @note This is the only recommended method for the user to create @ref ClAuxTensors + * + * @param[out] aux_tensors Auxiliary tensors required by the workload code + * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to + * + * @return Status + */ +Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code) +{ + for(auto t_id : code.tensors()) + { + // Get tensor object + const auto workload_arg = code.query_tensor(t_id); + ICLTensor *tensor_object = nullptr; + if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary) + { + // Create aux tensor CLTensor object + const TensorInfo tensor_info = *workload_arg->tensor_info(); + ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id); + const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info; + tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info); + if(tensor_object == nullptr) + { + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor"); + } + } + } + return Status{}; +} + +/** A fast tensor lookup table for runtime tensor object retrieval + */ +class ClTensorLUT +{ +public: + /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id + * + * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack + * + * @return ITensorPack* + */ + ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id) + { + auto tensor_pack = _tensor_packs.find(uwk_id); + if(tensor_pack != _tensor_packs.end()) + { + return &(tensor_pack->second); + } + return nullptr; + } + /** Get a tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ * + * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack + * + * @return ITensorPack& + */ + ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id) + { + return _tensor_packs.at(uwk_id); + } + + friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors); + +private: + /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id + * + * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack + * @param[in] tensor_pack Tensor pack to be added + */ + void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack) + { + _tensor_packs[uwk_id] = tensor_pack; + } + std::map<UnitWorkloadId, ITensorPack> _tensor_packs{}; +}; + +/** Create a fast tensor lookup table for runtime tensor retrieval + * + * @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels + * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to + * @param[in] user_tensors User tensors + * @param[in] aux_tensors Auxiliary tensors required by the workload code + * + * @return Status + */ +Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors) +{ + // Combine user tensors and aux tensors + std::map<ITensorInfo::Id, CLTensor *> tensor_map; + for(auto tensor : user_tensors) + { + const auto t_id = tensor->info()->id(); + if(tensor_map.find(t_id) != tensor_map.end()) + { + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids"); + } + tensor_map[t_id] = tensor; + } + for(const auto &data : aux_tensors.get_tensors()) + { + const auto t_id = data.tensor_info.id(); + const auto tensor = data.tensor; + if(tensor_map.find(t_id) != tensor_map.end()) + { + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids"); + } + tensor_map[t_id] = tensor; + } + + // Add tensor objects into corresponding tensor packs + for(auto id_tensor : tensor_map) + { + const auto t_id = id_tensor.first; + const auto tensor_object = id_tensor.second; + if(tensor_object == nullptr) + { + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs"); + } + if(tensor_object->allocator()->info().total_size() == 0U) + { + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor"); + } + + for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id)) + { + ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id); + if(tensor_pack == nullptr) + { + tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } }); + } + else + { + tensor_pack->add_tensor(t_id, tensor_object); + } + } + } + return Status{}; +} + +} // namespace + +struct ClWorkloadRuntime::Implementation +{ + std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{}; + std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{}; + bool _is_configured{ false }; + bool _is_prepared{ false }; + ClTensorLUT _tensor_lut{}; + ClAuxTensors _aux_tensors{}; + GpuWorkloadSourceCode _source_code{}; +}; + +ClWorkloadRuntime::ClWorkloadRuntime() + : _impl{ std::make_unique<Implementation>() } +{ +} + +ClWorkloadRuntime::~ClWorkloadRuntime() = default; + +Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
re-configured"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch"); + // Generate source code + _impl->_source_code = sketch.implementation().generate_source_code(); + // Configure unit workload from source code + for(auto uwk_id : _impl->_source_code.unit_workloads()) + { + const auto work = _impl->_source_code.query_unit_workload(uwk_id); + const auto stage = work.stage().stage; + auto k = std::make_unique<ClKernelRuntime>(); + k->configure(*sketch.gpu_context()->cl_compile_context(), work.code()); + + switch(stage) + { + case UnitWorkloadStage::Stage::Run: + _impl->_kernels.emplace(work.id(), std::move(k)); + break; + case UnitWorkloadStage::Stage::Prepare: + _impl->_kernels_prep.emplace(work.id(), std::move(k)); + break; + default: + ARM_COMPUTE_ERROR("Invalid unit workload stage"); + } + break; + } + // Create auxiliary tensor objects + create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code); + _impl->_is_configured = true; + return Status{}; +} + +void ClWorkloadRuntime::prepare() +{ + if(!_impl->_is_prepared) + { + for(auto &id_kernel_pair : _impl->_kernels_prep) + { + const bool flush_queue = false; + const auto uwk_id = id_kernel_pair.first; + auto kernel = id_kernel_pair.second.get(); + CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue); + } + + _impl->_is_prepared = true; + } +} + +Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors) +{ + // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed, + // in which case the lut can be cached during prepare + const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors); + ARM_COMPUTE_RETURN_ON_ERROR(st); + prepare(); + for(auto &id_kernel_pair : _impl->_kernels) + { + // Flush the command queue on the last kernel + const bool flush_queue = false; + const auto uwk_id = id_kernel_pair.first; + auto kernel = id_kernel_pair.second.get(); + CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue); + } + return Status{}; +} + +std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors() +{ + std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors; + for(const auto &data : _impl->_aux_tensors.get_tensors()) + { + aux_tensors.emplace_back(data.tensor, data.memory_info); + } + return aux_tensors; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h new file mode 100644 index 0000000000..f118d7d851 --- /dev/null +++ b/src/dynamic_fusion/sketch/ArgumentPack.h @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h new file mode 100644 index 0000000000..f118d7d851 --- /dev/null +++ b/src/dynamic_fusion/sketch/ArgumentPack.h @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK +#define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK + +#include "arm_compute/core/experimental/Types.h" +#include <unordered_map> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** This is a generic class that packs the arguments of an operator. For now, it is only used for tensor-related types + * Examples of "tensor-related types": @ref ITensorInfo, @ref ITensor, @ref ICLTensor + * + * The argument id is the position of the argument within the pack, and is represented by @ref TensorType + * + * @tparam T Tensor-related type + */ +template <typename T> +class ArgumentPack +{ +public: + /** @ref TensorType encodes the position of a tensor argument within the pack */ + using Id = TensorType; + /** A single argument element within the pack + * It contains either a const pointer or a non-const pointer to the Tensor-related type T, but never both at the same time + */ + struct PackElement + { + PackElement() = default; + PackElement(const PackElement &elem) = default; + PackElement &operator=(const PackElement &elem) = default; + PackElement(PackElement &&elem) = default; + PackElement &operator=(PackElement &&elem) = default; + PackElement(Id id, T *tensor) + : id(id), tensor(tensor), ctensor(nullptr) + { + } + PackElement(Id id, const T *ctensor) + : id(id), tensor(nullptr), ctensor(ctensor) + { + } + + Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */ + T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */ + const T *ctensor + { + nullptr + }; /**< Const pointer to tensor-related object */ + }; + +public: + /** Default constructor */ + ArgumentPack() = default; + /** Destructor */ + ~ArgumentPack() = default; + /** Allow instances of this class to be copy constructed */ + ArgumentPack<T>(const ArgumentPack<T> &other) = default; + /** Allow instances of this class to be copied */ + ArgumentPack<T> &operator=(const ArgumentPack<T> &other) = default; + /** Allow instances of this class to be move constructed */ + ArgumentPack<T>(ArgumentPack<T> &&other) = default; + /** Allow instances of this class to be moved */ + ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default; + /** Initializer-list constructor */ + ArgumentPack(const std::initializer_list<PackElement> &l) + : _pack{} + { + for(const auto &e : l) + { + _pack[e.id] = e; + } + } + /** Add tensor to the pack + * + * @param[in] id ID of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_tensor(Id id, T *tensor) + { + _pack[id] = PackElement(id, tensor); + } + /** Add const tensor to the pack + * + * @param[in] id ID of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_const_tensor(Id id, const T *tensor) + { + _pack[id] = PackElement(id, tensor); + } + /** Get tensor of a given id from the pack + * + * @param[in] id ID of tensor to extract + * + * @return Pointer to the tensor if it exists and is non-const, otherwise nullptr + */ + T *get_tensor(Id id) + { + auto it = _pack.find(id); + return it != _pack.end() ? it->second.tensor : nullptr; + }
+ /** Get constant tensor of a given id + * + * @param[in] id ID of tensor to extract + * + * @return Pointer to the tensor (const or not) if it exists, otherwise nullptr + */ + const T *get_const_tensor(Id id) const + { + auto it = _pack.find(id); + if(it != _pack.end()) + { + return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor; + } + return nullptr; + } + /** Remove the tensor stored with the given id + * + * @param[in] id ID of tensor to remove + */ + void remove_tensor(Id id) + { + _pack.erase(id); + } + /** Pack size accessor + * + * @return Number of tensors registered to the pack + */ + size_t size() const + { + return _pack.size(); + } + /** Checks if pack is empty + * + * @return True if empty else false + */ + bool empty() const + { + return _pack.empty(); + } + /** Get the ACL_SRC_* tensors + * + * @return std::vector<T *> + */ + std::vector<T *> get_src_tensors() + { + std::vector<T *> src_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_tensor(static_cast<TensorType>(id)); + if(tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the const ACL_SRC_* tensors + * + * @return std::vector<const T *> + */ + std::vector<const T *> get_const_src_tensors() const + { + std::vector<const T *> src_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_const_tensor(static_cast<TensorType>(id)); + if(tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the ACL_DST_* tensors + * + * @return std::vector<T *> + */ + std::vector<T *> get_dst_tensors() + { + std::vector<T *> dst_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_tensor(static_cast<TensorType>(id)); + if(tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + /** Get the const ACL_DST_* tensors + * + * @return std::vector<const T *> + */ + std::vector<const T *> get_const_dst_tensors() const + { + std::vector<const T *> dst_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_const_tensor(static_cast<TensorType>(id)); + if(tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + +private: + std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */ +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK */ diff --git a/src/dynamic_fusion/sketch/OperatorAttributes.cpp b/src/dynamic_fusion/sketch/OperatorAttributes.cpp new file mode 100644 index 0000000000..51ec444587 --- /dev/null +++ b/src/dynamic_fusion/sketch/OperatorAttributes.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022 Arm Limited.
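// --- Editorial example (not part of the patch) ------------------------------
// How the ArgumentPack above is typically populated and queried; src0, src1 and
// dst are hypothetical ITensorInfo objects.
ArgumentPack<ITensorInfo> pack{};
pack.add_const_tensor(ACL_SRC_0, &src0); // stored through the const-pointer slot
pack.add_const_tensor(ACL_SRC_1, &src1);
pack.add_tensor(ACL_DST_0, &dst);        // stored through the mutable-pointer slot
const auto srcs = pack.get_const_src_tensors(); // { &src0, &src1 }, in ACL_SRC_* order
const auto dsts = pack.get_dst_tensors();       // { &dst }
ARM_COMPUTE_ERROR_ON(pack.size() != 3);
ARM_COMPUTE_ERROR_ON(pack.get_const_tensor(ACL_SRC_0) == nullptr);
// -----------------------------------------------------------------------------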
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Conv2dAttributes &Conv2dAttributes::pad(const Padding2D &pad) +{ + _pad = pad; + return *this; +} +Padding2D Conv2dAttributes::pad() const +{ + return _pad; +} +Conv2dAttributes &Conv2dAttributes::stride(const Size2D &stride) +{ + _stride = stride; + return *this; +} +Size2D Conv2dAttributes::stride() const +{ + return _stride; +} +Conv2dAttributes &Conv2dAttributes::dilation(const Size2D &dilation) +{ + _dilation = dilation; + return *this; +} +Size2D Conv2dAttributes::dilation() const +{ + return _dilation; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h new file mode 100644 index 0000000000..93881508bb --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
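// --- Editorial example (not part of the patch) ------------------------------
// The Conv2dAttributes setters above return *this, so the attributes can be
// built fluently; the values shown are arbitrary.
const Conv2dAttributes conv_attr = Conv2dAttributes()
                                   .pad(Padding2D{ 1, 1, 1, 1 })
                                   .stride(Size2D{ 1, 1 })
                                   .dilation(Size2D{ 1, 1 });
// -----------------------------------------------------------------------------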
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES + +#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Services that are used throughout the creation phase of workload code + */ +class GpuComponentServices +{ +public: + /** Default constructor */ + GpuComponentServices() = default; + /** Get reference to component factory */ + GpuKernelComponentFactory &component_factory() + { + return _comp_factory; + } + +private: + GpuKernelComponentFactory _comp_factory{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp new file mode 100644 index 0000000000..9cecfc2ffd --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1) +{ + return info0.type == info1.type; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h new file mode 100644 index 0000000000..eb36e91d48 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT + +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Contains the information required to set up a kernel argument at run time + */ +struct GpuKernelArgumentInfo +{ + /** Enumerates all the tensor argument variants used by all kernel implementations. */ + enum class Type : int + { + Scalar, + + Vector, + + Image, + Image_Reinterpret_As_3D, + Image_Export_To_ClImage2D, + + Image_3D, // 3D Tensor represented as a 2D Image + stride_z + Image_3D_Export_To_ClImage2D, + + Tensor_3D, + Tensor_4D, + Tensor_4D_t_Buffer, + Tensor_4D_t_Image + }; + /** Default constructor */ + GpuKernelArgumentInfo() = default; + /** Constructor */ + GpuKernelArgumentInfo(Type type) + : type{ type } + { + } + Type type{ Type::Tensor_4D_t_Buffer }; +}; + +bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1); + +/** Kernel argument information linked with its corresponding @ref ITensorInfo + */ +class GpuKernelArgument +{ +public: + /** Constructor + * + * @param[in] tensor_info Associated @ref ITensorInfo + * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo + */ + GpuKernelArgument(const ITensorInfo &tensor_info, + const GpuKernelArgumentInfo &kernel_arg_info) + : _tensor_info{ tensor_info }, + _kernel_arg_info{ kernel_arg_info } + { + } + /** Get workload tensor id */ + ITensorInfo::Id id() const + { + return _tensor_info.id(); + } + /** Get associated @ref ITensorInfo */ + ITensorInfo *tensor_info() + { + return &_tensor_info; + } + /** Get associated @ref ITensorInfo */ + const ITensorInfo *tensor_info() const + { + return &_tensor_info; + } + /** Get associated @ref GpuKernelArgumentInfo */ + GpuKernelArgumentInfo *kernel_argument_info() + { + return &_kernel_arg_info; + } + /** Get associated @ref GpuKernelArgumentInfo */ + const GpuKernelArgumentInfo *kernel_argument_info() const + { + return &_kernel_arg_info; + } + /** Check if the associated workload tensor has a valid id + * + * @return true if has valid id + * @return false otherwise + */ + bool has_valid_id() const + { + return _tensor_info.has_valid_id(); + } + +private: + TensorInfo _tensor_info{}; + GpuKernelArgumentInfo _kernel_arg_info{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp new file mode 100644 index 0000000000..6e6422c957 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2022 Arm Limited.
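// --- Editorial example (not part of the patch) ------------------------------
// Pairing a tensor info with the kernel-argument variant that the generated
// kernel expects; t_info is a hypothetical ITensorInfo with a valid id. Note
// that GpuKernelArgumentInfo equality (GpuKernelArgument.cpp above) is shallow:
// only the Type member is compared.
const GpuKernelArgumentInfo arg_info{ GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer };
GpuKernelArgument           arg{ t_info, arg_info };
ARM_COMPUTE_ERROR_ON(!arg.has_valid_id());                        // holds because t_info carried a valid id
ARM_COMPUTE_ERROR_ON(!(*arg.kernel_argument_info() == arg_info)); // shallow Type comparison
// -----------------------------------------------------------------------------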
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "GpuKernelComponentGraph.h" + +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +/** Automatically create memory descriptors for all tensors in the graph + * + * @param[in] tensors @ref ITensorInfo map + * @param[in] graph @ref DependencyGraph of which the @p tensors are a part + * + * @return MemoryDescriptorMap An assignment map of @ref MemoryDescriptors for each ITensorInfo in the graph + */ +MemoryDescriptorMap assign_memory_descriptors(const std::map<ITensorInfo::Id, const ITensorInfo *> tensors, const DependencyGraph &graph) +{ + MemoryDescriptorMap mem_map{}; + for(auto t_id : graph.all_tensors()) + { + const auto &tensor = tensors.at(t_id); + // Only global src and dst tensors to the entire component graph are "User" tensors, which are user-specified memories + if(is_in(t_id, graph.global_src_tensors()) || is_in(t_id, graph.global_dst_tensors())) + { + mem_map[t_id] = MemoryDescriptor{ MemoryType::User }; + } + else + { + AuxMemoryInfo aux_mem_info{ tensor->total_size() }; + mem_map[t_id] = MemoryDescriptor{ MemoryType::Auxiliary, aux_mem_info }; + } + } + return mem_map; +} + +} // namespace + +std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors) +{ + std::vector<DependencyGraph::TensorId> tensor_ids{}; + std::transform( + std::begin(tensors), std::end(tensors), + std::back_inserter(tensor_ids), + [](const auto & t) + { + return t->id(); + }); + return tensor_ids; +} + +GpuKernelComponentGraph::GpuKernelComponentGraph(GpuComponentServices *services) + : _services{ services }, _components{}, _tensors{}, _dependency_graph{} +{ +} + +GpuKernelComponentStream GpuKernelComponentGraph::fuse() const +{ + // Obtain memory descriptor map + const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph); + /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup + /// Fusion can be framed as a mathematical optimization problem: + /// Given fusion constraints, find the "best" fusion patterns possible + /// "Best" is ill-defined at the moment. 
For now we define "best" fusion pattern as one + /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end + + /// As the first iteration, we offer a sub-optimal algorithm here which ensures all + /// constraints are met, but provides no guarantee that the fusion pattern is optimal + + GpuKernelComponentStream stream{ _services, mem_map }; + // Break down into linear groups of components (constraint 1), preserving topological order + const auto linear_graphs = _dependency_graph.topological_partition(); + + // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants) + for(const auto &graph : linear_graphs) + { + for(unsigned int i = 0; i < graph.size(); ++i) + { + const auto comp = _components.at(graph[i].op).get(); + // Each new linear graph signals a new component group in the stream + if(i == 0) + { + stream.new_component_group(); + } + // If it violates the component group's invariant / fusion constraint, breaks up the stream by inserting a new group + bool success = stream.add_component(comp); + if(!success) + { + stream.new_component_group(); + success = stream.add_component(comp); + ARM_COMPUTE_ERROR_ON(!success); + } + } + } + return stream; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h new file mode 100644 index 0000000000..fbcb2c10ea --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
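// --- Editorial illustration (not part of the patch) --------------------------
// What assign_memory_descriptors() above yields for a hypothetical two-component
// chain t0 -> [conv2d] -> t1 -> [elementwise] -> t2:
//   t0, t2 : MemoryType::User      (global graph src/dst; memory is user-provided)
//   t1     : MemoryType::Auxiliary (intermediate; AuxMemoryInfo sized by t1->total_size())
// fuse() then tries to place [conv2d] and [elementwise] into a single component
// group, starting a new group only when a fusion constraint would be violated.
// -----------------------------------------------------------------------------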
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH + +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h" +#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h" + +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class IGpuKernelComponent; + +/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components + * Its main purposes are: + * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators) + * - Automatically assign memory descriptions @ref MemoryDescriptor of all tensors based on graph topology + */ +class GpuKernelComponentGraph +{ +public: + /** Constructor + * + * @param[in] services @ref GpuComponentServices to be used by the graph + */ + GpuKernelComponentGraph(GpuComponentServices *services); + /** Prevent instances of this class from being copy constructed */ + GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete; + /** Prevent instances of this class from being copied */ + GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete; + /** Allow instances of this class to be move constructed */ + GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default; + /** Allow instances of this class to be moved */ + GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default; + /** Create a new component and add it to the component graph + * Component id is automatically allocated + * + * @tparam T Component type + * @tparam Args Component argument types + * + * @param[in] args Component arguments except for component id, which is auto-allocated + */ + template <typename T, typename... Args> + void add_new_component(Args &&... 
args) + { + auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...); + ArgumentPack<ITensorInfo> tensors = comp->tensors(); + const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors()); + const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors()); + bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); + ARM_COMPUTE_ERROR_ON(!success); + _components[comp->id()] = std::move(comp); + for(auto t : tensors.get_const_src_tensors()) + { + _tensors[t->id()] = t; + } + for(auto t : tensors.get_const_dst_tensors()) + { + _tensors[t->id()] = t; + } + } + /** Perform component fusion and serialize the graph into a stream of component groups + */ + GpuKernelComponentStream fuse() const; + +private: + static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors); + GpuComponentServices *_services; + std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components; + std::map<ITensorInfo::Id, const ITensorInfo *> _tensors; + DependencyGraph _dependency_graph{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp new file mode 100644 index 0000000000..3af4c1429d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
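// --- Editorial example (not part of the patch) ------------------------------
// Registering a component with the graph declared above. ClComponentDirectConv2d,
// properties, tensors and attributes are hypothetical stand-ins for a concrete
// IGpuKernelComponent and its constructor arguments; the component id itself is
// auto-allocated by the component factory.
GpuComponentServices      services{};
GpuKernelComponentGraph   graph{ &services };
ArgumentPack<ITensorInfo> tensors{ { ACL_SRC_0, &src }, { ACL_SRC_1, &wei }, { ACL_DST_0, &dst } };
graph.add_new_component<ClComponentDirectConv2d>(properties, tensors, attributes);
GpuKernelComponentStream stream = graph.fuse(); // serialise into fusable component groups
// -----------------------------------------------------------------------------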
+ */ +#include "GpuKernelComponentGroup.h" + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +bool GpuKernelComponentGroup::add_component(ComponentPtr component) +{ + // note: Constraint 1 is guaranteed as a precondition + // Constraint 2 + if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) + { + return false; + } + // Constraint 3.1: Pattern: (Unfusable + Output) + if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output) + { + return false; + } + // Constraint 3.2 + if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) + { + return false; + } + // Constraint 3.3: Disallow multiple output components + if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output) + { + return false; + } + // Constraint 4 + if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) + { + return false; + } + // Constraint 5 + if(!_components.empty() && !(get_root_component()->properties() == component->properties())) + { + return false; + } + // Constraint 7 + if(!_components.empty()) + { + const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); + ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor = root_dst_tensors[0]; + const auto dst_tensors = component->tensors().get_const_dst_tensors(); + for(const auto &t : root_dst_tensors) + { + if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + { + return false; + } + } + for(const auto &t : dst_tensors) + { + if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + { + return false; + } + } + } + // Constraint 8 + if(!_components.empty()) + { + const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); + ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); + const auto dst_tensors = component->tensors().get_const_dst_tensors(); + for(const auto &t : root_dst_tensors) + { + if(t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + for(const auto &t : dst_tensors) + { + if(t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + } + // Constraint 9 + if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) + { + return false; + } + // Constraint 9 corollary + if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) + { + return false; + } + _components.push_back(component); + return true; +} + +std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const +{ + if(_components.empty()) + { + return {}; + } + auto src_tensors = _components[0]->tensors().get_const_src_tensors(); + auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component + for(unsigned int i = 1; i < _components.size(); ++i) + { + auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors(); + for(const auto src_tensor : cur_src_tensors) + { + 
if(src_tensor->id() == prev_dst_tensor->id()) + { + continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components + } + src_tensors.push_back(src_tensor); + } + prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component + } + + return src_tensors; +} + +std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const +{ + if(_components.empty()) + { + return {}; + } + const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors(); + std::vector<const ITensorInfo *> dst_tensors; + for(auto tensor_ptr : dst_tensor_ptrs) + { + dst_tensors.push_back(tensor_ptr); + } + return dst_tensors; +} + +std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const +{ + std::vector<const ITensorInfo *> arguments; + const auto src_tensors = get_src_tensors(); + const auto dst_tensors = get_dst_tensors(); + arguments.reserve(src_tensors.size() + dst_tensors.size()); + arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end()); + arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end()); + return arguments; +} + +GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const +{ + if(empty()) + { + return nullptr; + } + return _components[0]; +} + +GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const +{ + if(empty()) + { + return nullptr; + } + return _components[_components.size() - 1]; +} + +GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const +{ + if(empty()) + { + return nullptr; + } + // Get the index of the requested component + size_t ind = 0; + for(const auto c : _components) + { + if(c->id() == id) + { + break; + } + ind++; + } + if(ind == 0 || ind >= _components.size()) + { + return nullptr; + } + return _components[ind - 1]; +} + +bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const +{ + return is_tensor_in(tensor, get_interm_tensors()); +} + +size_t GpuKernelComponentGroup::size() const +{ + return _components.size(); +} +bool GpuKernelComponentGroup::empty() const +{ + return _components.empty(); +} +GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) +{ + return _components[index]; +} +const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const +{ + return _components[index]; +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin() +{ + return _components.begin(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end() +{ + return _components.end(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const +{ + return _components.cbegin(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const +{ + return _components.cend(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const +{ + return _components.cbegin(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const +{ + return _components.cend(); +} + +std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const +{ + std::vector<const ITensorInfo *> 
interm_tensors{};
+    for(unsigned int i = 0; i + 1 < _components.size(); ++i)
+    {
+        auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0];
+        interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component
+    }
+
+    return interm_tensors;
+}
+
+bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors)
+{
+    for(auto t : tensors)
+    {
+        if(tensor->id() == t->id())
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
new file mode 100644
index 0000000000..4c9d940594
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+
+#include "components/Types.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+/** A group of gpu kernel components to be fused together
+ * PRECONDITIONS:
+ * 1. Fusion is limited to a linear sequence of kernel components
+ * INVARIANTS:
+ * @note These preconditions and invariants are exactly the same as fusion constraints for kernel components
+ * 2. Max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components (
+ *    excluding any output or input (if any) components.
+ *    The max number of output components is bounded by the maximum number of dst tensors allowed for a component / component group
+ *    )
+ * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output?
+ * 4. All components but unfusable ones have exactly 1 dst tensor
+ * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. )
+ * 6. All fused components share the same tunable parameters like tile size
+ * 7. All fused components share the same dst tensor shape
+ * 8. All fused components' tensors share the same @ref DataLayout
+ * 9. The maximum number of dst tensors allowed for a component (including unfusable) / component group is @ref GpuKernelComponentGroup::max_dst_tensors
+ *    This has an impact on the total number of components supported, which = max_fused_components + max_dst_tensors
+ */
+class GpuKernelComponentGroup
+{
+public:
+    using ComponentPtr = IGpuKernelComponent *;
+    /** Maximum number of components that can be fused into the same component group
+     */
+    static constexpr size_t max_fused_components = 64;
+    /** Maximum number of dst tensors allowed for a component / component group
+     */
+    static constexpr size_t max_dst_tensors = 8;
+
+public:
+    /** Default constructor */
+    GpuKernelComponentGroup() = default;
+    /** Allow instances of this class to be copy constructed */
+    GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default;
+    /** Allow instances of this class to be copied */
+    GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default;
+    /** Allow instances of this class to be move constructed */
+    GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default;
+    /** Allow instances of this class to be moved */
+    GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default;
+    /** Add a component pointer into the group
+     * If the operation fails, then no change is made to the group
+     *
+     * @param[in] component Pointer to the component to be added
+     *
+     * @return true  If the operation is successful
+     * @return false If the operation fails
+     */
+    bool add_component(ComponentPtr component);
+    /** Get source tensors of this group */
+    std::vector<const ITensorInfo *> get_src_tensors() const;
+    /** Get destination tensors of this group */
+    std::vector<const ITensorInfo *> get_dst_tensors() const;
+    /** Get tensor arguments of this group
+     * A tensor is an argument if it is a source or destination tensor to the group
+     */
+    std::vector<const ITensorInfo *> get_argument_tensors() const;
+    /** Get the root (first) component of this group */
+    ComponentPtr get_root_component() const;
+    /** Get the last component of this group */
+    ComponentPtr get_last_component() const;
+    /** Get the previous component to the component with id @p id
+     *
+     * @param[in] id Component id of the component whose previous component is of concern
+     *
+     * @return ComponentPtr Pointer to the previous component of the one identified by @p id
+     */
+    ComponentPtr get_previous_component(ComponentId id) const;
+    /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
+     *
+     * An intermediate tensor is any tensor that is not an argument.
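+     * For example, in a linear group of components c0 -> t1 -> c1 -> t2, where t2 is the
+     * group's dst tensor, t1 only links c0 and c1 and is therefore an intermediate tensor,
+     * while the group's src tensors and t2 are arguments.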
+ * + * @param[in] tensor @ref ITensorInfo to be looked up + * + * @return true If @p tensor is an intermediate tensor + * @return false Otherwise + */ + bool is_intermediate_tensor(const ITensorInfo *tensor) const; + /** Get the number of components within the group */ + size_t size() const; + /** Check if the component group is empty */ + bool empty() const; + ComponentPtr &operator[](size_t index); + const ComponentPtr &operator[](size_t index) const; + typename std::vector<ComponentPtr>::iterator begin(); + typename std::vector<ComponentPtr>::iterator end(); + typename std::vector<ComponentPtr>::const_iterator begin() const; + typename std::vector<ComponentPtr>::const_iterator end() const; + typename std::vector<ComponentPtr>::const_iterator cbegin() const; + typename std::vector<ComponentPtr>::const_iterator cend() const; + +private: + std::vector<const ITensorInfo *> get_interm_tensors() const; + + static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors); + + std::vector<ComponentPtr> _components{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp new file mode 100644 index 0000000000..aac84b6c59 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "GpuKernelComponentStream.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuKernelComponentStream::GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map) + : _services{ services }, _component_groups{}, _mem_map{ mem_map } +{ +} + +GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() +{ + GpuWorkloadSourceCode source_code; + // Traverse through component groups and assemble workload together + for(auto && group : _component_groups) + { + // Write kernel code + GpuLogicalKernel logical_kernel(_services, group); + const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code(); + // The whole unit workload stage is determined by the root component + const auto unit_workload_stage = group.get_root_component()->properties().stage(); + source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map); + } + return source_code; +} + +void GpuKernelComponentStream::new_component_group() +{ + _component_groups.emplace_back(); +} + +bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component) +{ + ARM_COMPUTE_ERROR_ON(_component_groups.empty()); + return _component_groups.back().add_component(component); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h new file mode 100644 index 0000000000..cbaa7c297b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM + +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuComponentServices; +class IGpuKernelComponent; + +/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph + * Each component group in the stream denotes a complete kernel that may consist of multiple components + * + * The main purposes of this class are: + * - Facilitate component fusion algorithm by allowing insertions of new component groups into the stream + * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode + */ +class GpuKernelComponentStream +{ +public: + /** Constructor + * + * @param[in] services @ref GpuComponentServices to be used throughout the stream + * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode + */ + GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map); + /** Allow instances of this class to be copy constructed */ + GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default; + /** Allow instances of this class to be copied */ + GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default; + /** Allow instances of this class to be move constructed */ + GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default; + /** Allow instances of this class to be moved */ + GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default; + /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */ + GpuWorkloadSourceCode write_workload_code(); + /** Insert a new component group in the stream. + * Subsequent components are added to this group until end of stream or the next new_component_group is called + */ + void new_component_group(); + /** Add a component to the previously created component group + * Throw an error if no component group is present in the stream + * + * @param[in] component Component to be inserted + * + * @return true If the operation is successful + * @return false Otherwise + */ + bool add_component(IGpuKernelComponent *component); + +private: + GpuComponentServices *_services; + std::vector<GpuKernelComponentGroup> _component_groups{}; + MemoryDescriptorMap _mem_map{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h new file mode 100644 index 0000000000..7479328d7b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/Window.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + +#include <map> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** The argument list of a @ref GpuKernelSourceCode */ +using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>; + +/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload + */ +class GpuKernelSourceCode +{ +public: + /** Set kernel name */ + GpuKernelSourceCode &name(const std::string &n) + { + _name = n; + return *this; + } + /** Set kernel code */ + GpuKernelSourceCode &code(const std::string &c) + { + _code = c; + return *this; + } + /** Set kernel config id string */ + GpuKernelSourceCode &config_id(const std::string &c_id) + { + _config_id = c_id; + return *this; + } + /** Set kernel build options */ + GpuKernelSourceCode &build_options(const CLBuildOptions &b_options) + { + _build_options = b_options; + return *this; + } + /** Set kernel execution window */ + GpuKernelSourceCode &window(const Window &window) + { + _window = window; + return *this; + } + /** Set kernel argument list */ + GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments) + { + _arguments = arguments; + return *this; + } + /** Get kernel name */ + std::string name() const + { + return _name; + } + /** Get kernel code */ + std::string code() const + { + return _code; + } + /** Get kernel config id string */ + std::string config_id() const + { + return _config_id; + } + /** Get kernel build options */ + const CLBuildOptions &build_options() const + { + return _build_options; + } + /** Get kernel execution window */ + const Window &window() const + { + return _window; + } + /** Get kernel argument list */ + const GpuKernelArgumentList &arguments() const + { + return _arguments; + } + +private: + std::string _name{}; + std::string _code{}; + std::string _config_id{}; + CLBuildOptions _build_options{}; + Window _window{}; + GpuKernelArgumentList _arguments{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */ diff --git 
a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..7746f8bbf3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuLogicalKernel.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
+    : _services{ services }, _comp_group{ components }, _store_components{}
+{
+    add_load_store();
+}
+
+GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
+{
+    GpuKernelSourceCode code;
+    ClTemplateWriter writer{ _comp_group };
+
+    code.name(writer.get_name());
+    code.code(writer.get_code());
+    code.arguments(writer.get_tensors());
+    code.build_options(writer.get_build_options());
+    code.config_id(writer.get_config_id());
+    code.window(writer.get_window());
+
+    return code;
+}
+
+void GpuLogicalKernel::add_load_store()
+{
+    const auto dst_tensors = _comp_group.get_dst_tensors();
+    // Each dst tensor from the component group requires exactly one store component
+    for(const auto &dst_tensor : dst_tensors)
+    {
+        ArgumentPack<ITensorInfo> tensors;
+        // Pass the same destination tensor to both the source and the destination of the store component
+        // In other words, the addition of a store component does not create a new dst tensor
+        // This way we avoid the issue of the component group's dst tensor differing from that of the logical kernel
+        // This may seem to violate the acyclic-ness of the component graph.
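+        // (the store component both reads and writes the same tensor, forming a self-loop).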
But it is fine because at the point of
+        // the construction of the logical kernel, we do not need a graph representation of components anymore
+        // (the graph has been serialized)
+        tensors.add_const_tensor(ACL_SRC_0, dst_tensor);
+        tensors.add_const_tensor(ACL_DST_0, dst_tensor);
+
+        auto store = _services->component_factory().create<ClComponentStore>(
+                         _comp_group.get_root_component()->properties(), // Store components share the same properties as the root component
+                         tensors);
+        _store_components.push_back(std::move(store));
+        auto success = _comp_group.add_component(_store_components.back().get());
+        ARM_COMPUTE_ERROR_ON(!success); // Any load/store insertion is guaranteed to succeed
+    }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
new file mode 100644
index 0000000000..4ce4443f60
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" + +#include <memory> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +class GpuComponentServices; +class IGpuKernelComponent; + +/** A wrapper-processor of a @ref GpuKernelComponentGroup + * It adds the load (if any) and store components to the component group + * The @ref GpuLogicalKernel represents a complete kernel, and can proceed to invoke any kernel writer to generate the full kernel code + */ +class GpuLogicalKernel +{ +public: + /** Constructor + * + * @param[in] services @ref GpuComponentServices to be used + * @param[in] components Component group from which this logical kernel is initialized + */ + explicit GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components); + /** Allow instances of this class to be copy constructed */ + GpuLogicalKernel(const GpuLogicalKernel &) = default; + /** Allow instances of this class to be copied */ + GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default; + /** Allow instances of this class to be move constructed */ + GpuLogicalKernel(GpuLogicalKernel &&) = default; + /** Allow instances of this class to be moved */ + GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default; + /** Generate a @ref GpuKernelSourceCode */ + GpuKernelSourceCode write_kernel_code(); + +private: + void add_load_store(); + + GpuComponentServices *_services; + GpuKernelComponentGroup _comp_group{}; + std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp new file mode 100644 index 0000000000..e8ef835405 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h" + +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors) +{ + std::vector<DependencyGraph::TensorId> tensor_ids{}; + std::transform( + std::begin(tensors), std::end(tensors), + std::back_inserter(tensor_ids), + [](const auto & t) + { + return t->id(); + }); + return tensor_ids; +} + +} // namespace + +Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors) + : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors } +{ +} + +OperatorId Operator::id() const +{ + return _id; +} + +GpuOperatorType Operator::operator_type() const +{ + return _operator_type; +} + +ArgumentPack<ITensorInfo> Operator::tensors() const +{ + return _tensors; +} + +bool GpuOperatorGroup::try_add_operator(const Operator &op) const +{ + const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); + const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); + // Constraint 1 + if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids)) + { + return false; + } + // Constraint 2 + if(_operators.size() >= max_fused_operators) + { + return false; + } + // Constraint 3.1: Pattern: (Unfusable) + if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) + { + return false; + } + // Constraint 3.2 + if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) + { + return false; + } + // Constraint 4 + if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) + { + return false; + } + // Constraint 5 + if(_operators.size() > 0) + { + const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); + ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor = root_dst_tensors[0]; + const auto dst_tensors = op.tensors().get_const_dst_tensors(); + for(const auto &t : root_dst_tensors) + { + if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + { + return false; + } + } + for(const auto &t : dst_tensors) + { + if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + { + return false; + } + } + } + // Constraint 6 + if(_operators.size() > 0) + { + const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); + ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); + const auto dst_tensors = op.tensors().get_const_dst_tensors(); + for(const auto &t : root_dst_tensors) + { + if(t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + for(const auto &t : dst_tensors) + { + if(t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + } + return true; +} +void GpuOperatorGroup::add_operator(const Operator &op) +{ + ARM_COMPUTE_ERROR_ON(!try_add_operator(op)); + const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); + const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); + _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids); + _operators[op.id()] = op; +} +Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const 
ArgumentPack<ITensorInfo> &tensors) const
+{
+    auto new_id = static_cast<OperatorId>(_operators.size());
+    return Operator{ new_id, operator_type, tensors };
+}
+const Operator *GpuOperatorGroup::get_root_operator() const
+{
+    const auto roots = _graph.get_root_ops();
+    ARM_COMPUTE_ERROR_ON(roots.size() > 1);
+    if(roots.empty())
+    {
+        return nullptr;
+    }
+    return &_operators.at(roots[0]);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
new file mode 100644
index 0000000000..35abe6c543
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using OperatorId = DependencyGraph::OperatorId;
+
+/** An operator for the sole purpose of validating fusion
+ */
+class Operator
+{
+public:
+    /** Default constructor */
+    Operator() = default;
+    /** Get Operator Id */
+    OperatorId id() const;
+    /** Get operator type */
+    GpuOperatorType operator_type() const;
+    /** Get tensor arguments */
+    ArgumentPack<ITensorInfo> tensors() const;
+    friend class GpuOperatorGroup;
+
+private:
+    Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
+    OperatorId                _id{};
+    GpuOperatorType           _operator_type{};
+    ArgumentPack<ITensorInfo> _tensors{};
+};
+
+/** A linear sequence of operators to be fused in a workload
+ * For the time being, this class is only used for validating operator fusion
+ * INVARIANTS:
+ * @note These invariants are exactly the same as operator fusion constraints
+ * 1. Fusion is limited to a linear sequence of operators
+ * 2. Max number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
+ * 3. The fusion is subject to the pattern: Complex + Simple * | Simple + Simple * | Un-fusable
+ * 4. All operators but unfusable ones have exactly 1 dst tensor
+ * 5.
All fused operators share the same dst tensor shape + * 6. All fused operators' tensors share the same @ref DataLayout + */ +class GpuOperatorGroup +{ +public: + static constexpr size_t max_fused_operators = 32; + /** Try adding (without actually adding) an operator to the group + * + * @param[in] op Operator to be added + * + * @return true If @p op can be added while maintaining the invariants + * @return false Otherwise + */ + bool try_add_operator(const Operator &op) const; + /** Add an operator to the group + * + * @param[in] op Operator to be added + */ + void add_operator(const Operator &op); + /** Create a new operator + * + * @param[in] operator_type @ref GpuOperatorType of the new operator + * @param[in] tensors Tensor arguments to the new operator + * + * @return Operator + */ + Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const; + /** Get the "root operator" of the group, which is the first operator in a linear sequence + * @return const Operator* Pointer to the root operator + */ + const Operator *get_root_operator() const; + +private: + DependencyGraph _graph{}; + std::map<OperatorId, Operator> _operators{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h new file mode 100644 index 0000000000..c77697c343 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Contain properties common to all operator types */ + +/** Operator type in the context of fusion + */ +enum class GpuOperatorType +{ + /** Simple operators are operators that: + * 1. Have a 1-to-1 mapping between the input elements and output elements, like elementwise + * 2. 
Have exactly 1 output + */ + Simple, + /** Complex operators are operators that are not simple but are still fusable with simple ones + */ + Complex, + /** Unfusable operators are operators that cannot be fused with any other types of operators + */ + Unfusable +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp new file mode 100644 index 0000000000..623bf351f8 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" +#include "arm_compute/core/CL/CLCompileContext.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx) + : _gpu_language{ GpuLanguage::OpenCL }, _cl_compile_ctx{ cl_compile_ctx } +{ +} + +GpuTarget GpuWorkloadContext::gpu_target() const +{ + return _cl_compile_ctx->get_gpu_target(); +} + +GpuLanguage GpuWorkloadContext::gpu_language() const +{ + return _gpu_language; +} + +const CLCompileContext *GpuWorkloadContext::cl_compile_context() const +{ + return _cl_compile_ctx; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp new file mode 100644 index 0000000000..ce7cf1e908 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuWorkloadSketch::GpuWorkloadSketch(Context *context) + : _impl{ std::make_unique<Implementation>(context) } +{ +} +GpuWorkloadSketch::~GpuWorkloadSketch() +{ +} + +const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const +{ + return _impl->context(); +} + +TensorInfo GpuWorkloadSketch::create_tensor_info(const ITensorInfo &tensor_info) +{ + TensorInfo tensor{ tensor_info }; + tensor.set_id(allocate_new_tensor_id()); + return tensor; +} + +TensorInfo GpuWorkloadSketch::create_tensor_info() +{ + TensorInfo tensor{}; + tensor.set_id(allocate_new_tensor_id()); + return tensor; +} + +ITensorInfo::Id GpuWorkloadSketch::allocate_new_tensor_id() +{ + return _impl->allocate_new_tensor_id(); +} + +GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() +{ + return *_impl; +} +const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const +{ + return *_impl; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h new file mode 100644 index 0000000000..3997395c98 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL + +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h" +#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Internal implementation of @ref GpuWorkloadSketch */ +class GpuWorkloadSketch::Implementation +{ +public: + /** Constructor + * + * @param[in] context global workload creation context + */ + explicit Implementation( + Context *context) + : _context{ context }, + _comp_services{}, + _component_graph{ &_comp_services }, + _operator_group{} + { + } + /** Prevent instances of this class from being copy constructed */ + Implementation(const Implementation &impl) = delete; + /** Prevent instances of this class from being copied */ + Implementation &operator=(const Implementation &impl) = delete; + /** Allow instances of this class to be move constructed */ + Implementation(Implementation &&impl) = default; + /** Allow instances of this class to be moved */ + Implementation &operator=(Implementation &&impl) = default; + /** Get workload context */ + const Context *context() const + { + return _context; + } + /** Get component graph */ + const GpuKernelComponentGraph &component_graph() const + { + return _component_graph; + } + /** Get component graph */ + GpuKernelComponentGraph &component_graph() + { + return _component_graph; + } + /** Get operator group */ + const GpuOperatorGroup &operator_group() const + { + return _operator_group; + } + /** Get operator group */ + GpuOperatorGroup &operator_group() + { + return _operator_group; + } + ITensorInfo::Id allocate_new_tensor_id() + { + return ++_next_id; + } + /** Generate @ref GpuWorkloadSourceCode from the workload sketch + * @note The sketch must be valid. Any error encountered during the building of the code will be thrown. + * + * @return GpuWorkloadSourceCode The generated workload code + */ + GpuWorkloadSourceCode generate_source_code() const + { + return component_graph().fuse().write_workload_code(); + } + +private: + Context *_context; + GpuComponentServices _comp_services; + GpuKernelComponentGraph _component_graph; + GpuOperatorGroup _operator_group; + ITensorInfo::Id _next_id{ ITensorInfo::invalid_tensor_id }; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h new file mode 100644 index 0000000000..2375f5c6c6 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */ +using UnitWorkloadId = int32_t; + +/** Describes all the info related to a kernel in order to: + * - be used by runtime to configure gpu kernel argument + * - be used by memory managers to allocate required memory + */ +class GpuWorkloadArgument +{ +public: + /** Default constructor */ + GpuWorkloadArgument() = default; + /** Constructor + * + * @param[in] tensor_info @ref ITensorInfo of the workload argument + * @param[in] mem_desc @ref MemoryDescriptor of the workload argument + * @param[in] kernel_arg_info @ref GpuKernelArgumentInfo of the workload argument + */ + GpuWorkloadArgument(const ITensorInfo &tensor_info, + const MemoryDescriptor &mem_desc, + const GpuKernelArgumentInfo &kernel_arg_info) + : _tensor_info{ tensor_info }, + _mem_desc{ mem_desc }, + _kernel_arg_info{ kernel_arg_info } + { + } + /** Get tensor id within workload */ + ITensorInfo::Id id() const + { + return _tensor_info.id(); + } + /** Get @ref ITensorInfo of the argument */ + ITensorInfo *tensor_info() + { + return &_tensor_info; + } + /** Get @ref ITensorInfo of the argument */ + const ITensorInfo *tensor_info() const + { + return &_tensor_info; + } + /** Get @ref MemoryDescriptor of the argument */ + MemoryDescriptor *memory_descriptor() + { + return &_mem_desc; + } + /** Get @ref MemoryDescriptor of the argument */ + const MemoryDescriptor *memory_descriptor() const + { + return &_mem_desc; + } + /** Get @ref GpuKernelArgumentInfo of the argument */ + GpuKernelArgumentInfo *kernel_argument_info() + { + return &_kernel_arg_info; + } + /** Get @ref GpuKernelArgumentInfo of the argument */ + const GpuKernelArgumentInfo *kernel_argument_info() const + { + return &_kernel_arg_info; + } + /** Check if the workload argument has valid id + * + * @return true If has valid id + * @return false Otherwise + */ + bool has_valid_id() const + { + return _tensor_info.has_valid_id(); + } + +private: + TensorInfo _tensor_info{}; + MemoryDescriptor 
_mem_desc{}; + GpuKernelArgumentInfo _kernel_arg_info{}; +}; + +/** Describes when a unit workload is run. + */ +struct UnitWorkloadStage +{ + enum class Stage + { + Prepare, /**< Only run once at the beginning. */ + Run, /**< Run every time after the first time. */ + }; + Stage stage{ Stage::Run }; +}; + +inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1) +{ + return stage0.stage == stage1.stage; +} + +/** The atomic unit in a Gpu workload. It contains exactly one kernel to run. + */ +class GpuUnitWorkload +{ +public: + /** Default constructor */ + GpuUnitWorkload() = default; + /** Constructor + * + * @param[in] id Id that uniquely identifies this unit workload in a workload + * @param[in] kernel_code @ref GpuKernelSourceCode contained within + * @param[in] stage Stage of the unit workload + */ + GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage) + : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage } + { + } + /** Get the id of the unit workload */ + UnitWorkloadId id() const + { + return _id; + } + /** Get reference to the underlying @ref GpuKernelSourceCode */ + const GpuKernelSourceCode &code() const + { + return _kernel_code; + } + /** Get the stage of the unit workload */ + UnitWorkloadStage stage() const + { + return _stage; + } + +private: + UnitWorkloadId _id{}; + GpuKernelSourceCode _kernel_code{}; + UnitWorkloadStage _stage{}; +}; + +/** Hold the generated kernel source code and other information required to compile and run the workload. + */ +class GpuWorkloadSourceCode +{ +public: + /** Default constructor */ + GpuWorkloadSourceCode() = default; + /** Add a unit workload to the workload code + * + * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload + * @param[in] stage Stage of the unit workload + * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload + * + * @return UnitWorkloadId Allocated unit workload id + */ + UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map) + { + // Use the size of the kernel codes as Id + const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size()); + const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage); + _unit_workloads.push_back(unit_work); + // Assemble kernel argument with memory descriptor to form workload argument + for(const auto &id_arg : kernel_code.arguments()) + { + const auto arg_id = id_arg.first; + const auto arg = id_arg.second; + _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() }; + if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end()) + { + _tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>(); + } + _tensor_uwork_map[arg_id].insert(uwk_id); + } + return uwk_id; + } + /** Get a unit workload from its id */ + const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const + { + ARM_COMPUTE_ERROR_ON(id < 0); + return _unit_workloads.at(id); + } + /** Get all unit workloads sorted in topological order */ + std::vector<UnitWorkloadId> unit_workloads() const + { + std::vector<UnitWorkloadId> ids{}; + + for(const auto &uwk : _unit_workloads) + { + ids.push_back(uwk.id()); + } + return ids; + } + /** Get a @ref GpuWorkloadArgument from its associated tensor id */ + const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const + { + return &_workload_arguments.at(t_id); + 
} + /** Get all tensors in the entire workload */ + std::vector<ITensorInfo::Id> tensors() const + { + std::vector<ITensorInfo::Id> ids{}; + for(const auto &id_tensor : _workload_arguments) + { + ids.push_back(id_tensor.first); + } + return ids; + } + /** Get all unit workloads connected to the tensor with @p t_id */ + std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const + { + const auto unit_work_set = _tensor_uwork_map.at(t_id); + return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end()); + } + +private: + std::vector<GpuUnitWorkload> _unit_workloads{}; + std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{}; + std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE */ diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h new file mode 100644 index 0000000000..ae67790b4b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/Window.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + +#include <map> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** An interface that can write a gpu kernel + */ +class IGpuKernelWriter +{ +public: + /** Destructor */ + virtual ~IGpuKernelWriter() + { + } + /** Generate kernel name */ + virtual std::string get_name() = 0; + /** Generate kernel code */ + virtual std::string get_code() = 0; + /** Generate build options */ + virtual CLBuildOptions get_build_options() = 0; + /** Generate config id string of the entire kernel. 
This is used for tuning */ + virtual std::string get_config_id() = 0; + /** Generate execution window */ + virtual Window get_window() const = 0; + /** Get the kernel argument lists of the kernel*/ + virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() = 0; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER */ diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h new file mode 100644 index 0000000000..f7f0029618 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY + +#include "Types.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include <memory> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids + */ +class GpuKernelComponentFactory +{ +public: + /** Create a new kernel component + * + * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent + * @tparam Args Argument types to construct the kernel component + * + * @param[in] args Arguments to construct the kernel component + * + * @return std::unique_ptr<IGpuKernelComponent> + */ + template <typename T, typename... Args> + std::unique_ptr<IGpuKernelComponent> create(Args &&... args) + { + return std::make_unique<T>(_count++, std::forward<Args>(args)...); + } + +private: + ComponentId _count{ 0 }; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */ diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h new file mode 100644 index 0000000000..8bb19155a2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT + +#include "Types.h" + +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Properties common to all kernel component types */ +class KernelProperties +{ +public: + KernelProperties &stage(const UnitWorkloadStage &stage) + { + _stage = stage; + return *this; + } + UnitWorkloadStage stage() const + { + return _stage; + } + +private: + UnitWorkloadStage _stage{}; +}; + +inline bool operator==(const KernelProperties &config0, const KernelProperties &config1) +{ + return config0.stage() == config1.stage(); +} + +/** Forward declaration */ +class IGpuTemplateComponentWriter; + +/** An abstract interface of a component. 
It enables manipulation by the component graph for purposes like fusion + */ +class IGpuKernelComponent +{ +public: + using Properties = KernelProperties; + +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] properties Kernel component properties + * @param[in] tensors Tensor arguments to the components + */ + IGpuKernelComponent( + ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors) + : _id{ id }, + _properties{ properties }, + _tensors{ tensors } + { + } + /** Destructor */ + virtual ~IGpuKernelComponent() + { + } + /** Get component id */ + ComponentId id() const + { + return _id; + } + /** Get tensor arguments */ + ArgumentPack<ITensorInfo> tensors() const + { + return _tensors; + } + /** Get properties */ + Properties properties() const + { + return _properties; + } + /** Get template writer for the component */ + virtual const IGpuTemplateComponentWriter *template_writer() const = 0; + /** Get component type */ + virtual GpuComponentType type() const = 0; + +private: + ComponentId _id{ -1 }; + Properties _properties{}; + ArgumentPack<ITensorInfo> _tensors{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT */ diff --git a/src/dynamic_fusion/sketch/gpu/components/Types.h b/src/dynamic_fusion/sketch/gpu/components/Types.h new file mode 100644 index 0000000000..54b3a69057 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/Types.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES + +#include <cstdint> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Uniquely identifies a kernel component within a workload + */ +using ComponentId = int32_t; + +/** Component type in the context of fusion + * Its main purpose is to inform the optimizer how to perform fusion. 
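+ *
+ * As an illustrative, non-normative sketch (the real fusion policy lives in the optimizer and
+ * is not part of this header), a consumer of this type might gate fusion as follows:
+ * @code
+ * bool may_join(GpuComponentType group_type, GpuComponentType candidate)
+ * {
+ *     // Hypothetical policy: Unfusable components always form a group of their own,
+ *     // and at most one Complex component is allowed per fused group
+ *     if(group_type == GpuComponentType::Unfusable || candidate == GpuComponentType::Unfusable)
+ *     {
+ *         return false;
+ *     }
+ *     return !(group_type == GpuComponentType::Complex && candidate == GpuComponentType::Complex);
+ * }
+ * @endcode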
+ */ +enum class GpuComponentType +{ + Complex, + Simple, + Unfusable, + Output +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES */ diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp new file mode 100644 index 0000000000..e94cfd1581 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClComponentDirectConv2d.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +using Settings = ClComponentDirectConv2dSettings; + +Settings &Settings::export_to_cl_image(bool cl_image) +{ + _export_to_cl_image = cl_image; + return *this; +} + +bool Settings::export_to_cl_image() const +{ + return _export_to_cl_image; +} + +Settings &Settings::fast_relaxed_math(bool fast_relaxed_math) +{ + _fast_relaxed_math = fast_relaxed_math; + return *this; +} + +bool Settings::fast_relaxed_math() const +{ + return _fast_relaxed_math; +} + +Status ClComponentDirectConv2d::validate( + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties, settings); + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst); + + // 1. 
Check validity + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); + } + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); + } + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); + } + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + // wei shape is correct + const DataLayout data_layout = src->data_layout(); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + + // dst shape is correct + PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType{}); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); + + // bia shape is correct + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3), + "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, + "Biases should be one dimensional"); + } + + // 2. Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + return Status{}; +} + +ClComponentDirectConv2d::ClComponentDirectConv2d( + ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{ id, properties, tensors }, + _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) } +{ +} +ClComponentDirectConv2d::~ClComponentDirectConv2d() +{ +} +const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h new file mode 100644 index 0000000000..fec22b84a5 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D + +#include "arm_compute/core/Error.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include <memory> + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; +class Conv2dAttributes; + +/** Component specific settings + */ +class ClComponentDirectConv2dSettings +{ +public: + /** Set export_to_cl_image flag */ + ClComponentDirectConv2dSettings &export_to_cl_image(bool cl_image); + /** Get export_to_cl_image flag */ + bool export_to_cl_image() const; + + /** Set fast_relaxed_math flag */ + ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math); + /** Get fast_relaxed_math flag */ + bool fast_relaxed_math() const; + +private: + bool _export_to_cl_image{ false }; + bool _fast_relaxed_math{ true }; +}; + +/** Forward declaration */ +class ClTemplateDirectConv2d; + +class ClComponentDirectConv2d final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = Conv2dAttributes; + /** Settings are a set of backend-specific parameters that influence the implementation of a component */ + using Settings = ClComponentDirectConv2dSettings; + +public: + /** Validate the component + * + * @param[in] properties Component properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: Input + * - ACL_SRC_1: Weight + * - ACL_SRC_2: Bias (Optional) + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_SRC_1: Const + * - ACL_SRC_2: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 | + * |:--------------|:--------------|:--------------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + */ + static Status validate( + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes 
&attributes, + const Settings &settings); + + /** Constructor + * + * Similar to @ref ClComponentDirectConv2d::validate() + */ + ClComponentDirectConv2d( + ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Destructor */ + ~ClComponentDirectConv2d() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default; + /** Get template writer for the component */ + const IGpuTemplateComponentWriter *template_writer() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr<ClTemplateDirectConv2d> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp new file mode 100644 index 0000000000..f49f397ec1 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentStore.h" + +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h" + +#include <memory> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status ClComponentStore::validate( + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors) +{ + ARM_COMPUTE_UNUSED(properties, tensors); + return Status{}; +} +ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors) + : IGpuKernelComponent{ id, properties, tensors }, _component_writer{ std::make_unique<ClTemplateStore>(id, tensors) } +{ +} +ClComponentStore::~ClComponentStore() +{ +} +const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h new file mode 100644 index 0000000000..bf8c9f031e --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE + +#include "arm_compute/core/Error.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h" +#include <memory> + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; + +class ClComponentStore final : public IGpuKernelComponent +{ +public: + /** Validate the component + * + * @param[in] properties Component properties + * @param[in] tensors Tensor arguments to the components + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: Input + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |ACL_SRC_0 |ACL_DST_0 | + * |:--------------|:--------------| + * |All |All | + */ + static Status validate( + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors); + /** Constructor + * + * Similar to @ref ClComponentStore::validate() + */ + ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors); + /** Destructor */ + ~ClComponentStore() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentStore(const ClComponentStore &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentStore &operator=(const ClComponentStore &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentStore(ClComponentStore &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentStore &operator=(ClComponentStore &&component) = default; + /** Get template writer for the component */ + const IGpuTemplateComponentWriter *template_writer() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Output; + } + +private: + std::unique_ptr<ClTemplateStore> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE */ diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp new file mode 100644 index 0000000000..98c1cc3939 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, const cl::Device &device, DataLayout data_layout)
+{
+    if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
+    {
+        return false;
+    }
+
+    // Only floating-point data types can be exported to a cl_image
+    if(!is_data_type_float(tensor->data_type()))
+    {
+        return false;
+    }
+
+    if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+    {
+        return false;
+    }
+
+    // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+    if(!image2d_from_buffer_supported(device))
+    {
+        return false;
+    }
+
+    // Check cl image pitch alignment
+    if(get_cl_image_pitch_alignment(device) == 0)
+    {
+        return false;
+    }
+
+    const size_t image_w     = tensor->tensor_shape()[0] / 4;
+    const size_t image_h     = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
+    const size_t max_image_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+    const size_t max_image_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+    if(image_w > max_image_w || image_h > max_image_h)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
+                              const ITensorInfo       *src,
+                              const ITensorInfo       *wei,
+                              const ITensorInfo       *bia,
+                              const ITensorInfo       *dst,
+                              const Conv2dAttributes  &attributes)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        !src->has_valid_id() || !wei->has_valid_id() || !dst->has_valid_id());
+    if(bia != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+    }
+
+    // Perform fusion test
+    // Pack tensor infos
+    ArgumentPack<ITensorInfo> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, src);
+    tensors.add_const_tensor(ACL_SRC_1, wei);
+    tensors.add_const_tensor(ACL_SRC_2, bia);
+    tensors.add_const_tensor(ACL_DST_0, dst);
+    const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+                                    "Operator fusion test failed. 
This operator cannot be fused into the workload"); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + const auto data_layout = src->data_layout(); + + { + auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(), + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + + auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape)); + } + + // Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + const auto sketch_ctx = sketch.implementation().context(); + + const auto gpu_target = sketch_ctx->gpu_target(); + + if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + // Validate Direct Conv2d Component + { + const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + auto settings = ClComponentDirectConv2d::Settings(); + + settings.export_to_cl_image( + export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout)); + + settings.fast_relaxed_math( + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) + && (dst_info_to_validate.data_type() == DataType::F32 || dst_info_to_validate.data_type() == DataType::F16)); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_SRC_1, wei); + arguments.add_const_tensor(ACL_SRC_2, bia); + arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + return Status{}; +} + +void GpuConv2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + ITensorInfo *wei, + ITensorInfo *bia, + ITensorInfo *dst, + const Conv2dAttributes &attributes) +{ + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst); + const auto data_layout = src->data_layout(); + + // Auto initialize dst tensor + { + auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(), + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); + } + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + + const auto sketch_ctx = sketch.implementation().context(); + + const auto gpu_target = sketch_ctx->gpu_target(); + + if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add Direct Conv2d Component + { + auto properties = 
IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + + auto settings = ClComponentDirectConv2d::Settings(); + + settings.export_to_cl_image( + export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout)); + + settings.fast_relaxed_math( + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) + && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); + + if(settings.export_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); + } + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_SRC_1, wei); + arguments.add_const_tensor(ACL_SRC_2, bia); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_tensor(ACL_DST_0, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp new file mode 100644 index 0000000000..13c0b141a5 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "GpuKernelVariableTable.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/ITensorInfo.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +void GpuKernelVariableTable::declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm, const std::string &alias) +{ + ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); + // Do not re-declare if the variable associated with the tensor has already been declared + if(get_variable(tensor).has_valid_id()) + { + ARM_COMPUTE_ERROR_ON(!(get_variable(tensor).kernel_argument_info == argument_info)); + return; + } + // Declare variable associated with the tensor + std::stringstream ss; + ss << alias << "_t" << tensor->id(); + const auto uniq_name = ss.str(); + TensorVariable var{ tensor->id(), uniq_name, argument_info }; + + if(is_interm) + { + _interm_var = var; + _interm_tensors.insert(tensor->id()); + } + else + { + _vars.emplace(tensor->id(), var); + } +} + +GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const +{ + const TensorVariable empty_var{}; + if(_vars.find(tensor->id()) != _vars.end()) + { + return _vars.at(tensor->id()); + } + if(_interm_tensors.find(tensor->id()) != _interm_tensors.end()) + { + return _interm_var; + } + return empty_var; +} + +GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const +{ + VariableList vars{}; + for(const auto &tensor : tensors) + { + if(!tensor->has_valid_id()) + { + continue; + } + vars.push_back(get_variable(tensor)); + } + return vars; +} + +TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) + : value{ var.uniq_name } +{ +} + +TagVal::TagVal(const std::string &val) + : value{ val } +{ +} + +TagVal::TagVal(const char *val) + : value{ std::string(val) } +{ +} + +TagVal::TagVal(const DataType &data_type) + : value{ get_cl_type_from_data_type(data_type) } +{ +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h new file mode 100644 index 0000000000..4eee3963c2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "support/Requires.h"
+#include "support/StringSupport.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A table of all the variables used in the kernel.
+ * Since fusion is restricted to a linear sequence of components in a kernel, only a single "intermediate variable" (the accumulator) is allowed.
+ * Each kernel has exactly one variable table.
+ */
+class GpuKernelVariableTable
+{
+public:
+    /** A tensor variable whose main purposes are:
+     *  - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
+     *  - Hold the generated variable name for the associated tensor info
+     */
+    struct TensorVariable
+    {
+    public:
+        TensorVariable()                       = default;
+        TensorVariable(const TensorVariable &) = default;
+        TensorVariable &operator=(const TensorVariable &) = default;
+        ITensorInfo::Id       id{ ITensorInfo::invalid_tensor_id };
+        std::string           uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+        GpuKernelArgumentInfo kernel_argument_info{};
+        bool                  has_valid_id() const
+        {
+            return id != ITensorInfo::invalid_tensor_id;
+        }
+    };
+    using VariableList = std::vector<TensorVariable>;
+
+public:
+    /** Declare a @ref TensorVariable for a corresponding tensor info.
+     *
+     * @note A later re-declaration of the intermediate variable overwrites the previous association with its @ref ITensorInfo.
+     *       The order of declaration therefore matters; it is assumed that the components declaring the variables are already in the correct order.
+     *
+     * @param[in] tensor        Tensor info with which the new variable is associated
+     * @param[in] argument_info Kernel argument information
+     * @param[in] is_interm     If the new variable is an intermediate variable
+     * @param[in] alias         Alias for the variable. 
Will be used as part of the variable name + */ + void declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm = false, const std::string &alias = "unnamed"); + /** Get the @ref TensorVariable associated with @p tensor + * + * @param[in] tensor Tensor info to be queried + * + * @return TensorVariable + */ + TensorVariable get_variable(const ITensorInfo *tensor) const; + /** Get the @ref TensorVariable list associated with @p tensors + * @note Empty tensors are skipped + * + * @param[in] tensors List of tensor infos to be queried + * + * @return VariableList + */ + VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const; + +private: + std::map<ITensorInfo::Id, TensorVariable> _vars{}; /**< Non-intermediate (function parameter) variables*/ + TensorVariable _interm_var{}; /**< Intermediate variable */ + std::set<ITensorInfo::Id> _interm_tensors{}; /**< Tensors associated with the single intermediate variable */ +}; + +/** A tag value will substitute a tag in a string template during its instantiation */ +struct TagVal +{ + /** Default constructor */ + TagVal() = default; + /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */ + TagVal(const GpuKernelVariableTable::TensorVariable &var); + /** Construct a @ref TagVal from an integral type */ + template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)> + TagVal(T val) + : value{ support::cpp11::to_string(val) } + { + } + /** Construct a @ref TagVal from a string */ + TagVal(const std::string &val); + /** Construct a @ref TagVal from a c-style string */ + TagVal(const char *val); + /** Construct a @ref TagVal from a @ref DataType */ + TagVal(const DataType &data_type); + /** Get the value of the TagVal as a converted string */ + std::string value{}; +}; + +/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */ +using Tag = std::string; + +/** Tag lookup table. It is used to instantiate a string template */ +using TagLUT = std::unordered_map<Tag, TagVal>; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h new file mode 100644 index 0000000000..c85ddf5a2c --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Window.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/Types.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +class GpuKernelComponentGroup; +class GpuKernelVariableTable; + +/** An interface used by @ref ClTemplateWriter to write source code for a kernel component + */ +class IGpuTemplateComponentWriter +{ +public: + using ComponentGroup = GpuKernelComponentGroup; + +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + */ + IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> tensors) + : _id{ id }, _tensors{ tensors } + { + } + /** Destructor */ + virtual ~IGpuTemplateComponentWriter() + { + } + /** Generate kernel component name */ + virtual std::string get_name() const = 0; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0; + /** Generate the tag look-up table used to instantiate the component code. 
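+ *
+ * As an illustration (a sketch of plausible entries, not the writer's verbatim output), a
+ * template containing "{{src}}" and "{{ACC_DATA_TYPE}}" tags could be instantiated with:
+ * @code
+ * TagLUT lut{};
+ * lut["src"]           = vtable.get_variable(_src);  // resolves "{{src}}" to the variable's unique name, e.g. "src_t3"
+ * lut["ACC_DATA_TYPE"] = _src->data_type();          // TagVal renders a DataType as its CL type string
+ * @endcode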
+ * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0; + /** Generate additional macros used in the component */ + virtual std::string get_additional_macros() const + { + return ""; + } + /** Generate the build options used in the component + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return CLBuildOptions Build options + */ + virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const + { + ARM_COMPUTE_UNUSED(comp_group); + return CLBuildOptions{}; + } + /** Generate the component config id string used for tuning */ + virtual std::string get_config_id() const + { + return ""; + } + /** Generate the header list used in the component */ + virtual std::set<std::string> get_headers_list() const + { + return std::set<std::string> {}; + } + /** Generate the execution window for the component */ + virtual Window get_window() const + { + return Window{}; + } + /** Get tensor arguments */ + ArgumentPack<ITensorInfo> tensors() const + { + return _tensors; + } + /** Get component id */ + ComponentId id() const + { + return _id; + } + +private: + ComponentId _id{ -1 }; + ArgumentPack<ITensorInfo> _tensors{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp new file mode 100644 index 0000000000..870de64eb8 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClTemplateDirectConv2d.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuTemplateComponentWriter{ id, tensors }, + _src{}, + _weight{}, + _bias{}, + _dst{}, + _attributes{ attributes }, + _settings{ settings } +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst); +} + +std::string ClTemplateDirectConv2d::get_name() const +{ + return "direct_conv2d"; +} + +std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); + const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx)); + const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0; + + std::string code = R"_( +//------------------ START KERNEL {{meta_kernel_id}} --------------------- +// IN_0(src) {{src}} +// IN_1(wei) {{weight}} +)_"; + if(_bias && _bias->has_valid_id()) + { + code += R"_( +// IN_1(bia) {{bias}} +)_"; + } + code += R"_( +// OUT(dst, accum) {{dst}} + +// Initialize the accumulators +TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); +{ + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
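+    // The {{...}} tokens below are template tags resolved through the TagLUT at instantiation
+    // time. M0/N0/K0 are the tile sizes; as a working assumption, M0 spans rows of the dst tile,
+    // N0 spans output channels and K0 is the slice of input channels accumulated per step.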
+#define _IWEI_WIDTH {{WEI_WIDTH}}
+#define _IWEI_HEIGHT {{WEI_HEIGHT}}
+#define _ISRC_WIDTH {{src}}_w
+#define _ISRC_HEIGHT {{src}}_h
+#define _ISRC_CHANNELS {{src}}_c
+#define _IDST_WIDTH {{arg_dst}}_w
+#define _IDST_HEIGHT {{arg_dst}}_h
+#define _IDST_CHANNELS {{arg_dst}}_c
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+    // .v    = access the whole vector (OpenCL vector)
+    // .s[x] = access the vector element at position x (scalar access)
+    TILE(int, M0, 1, xi);
+    TILE(int, M0, 1, yi);
+
+    // Convert the linear index to coordinate
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
+        yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
+        xi[i].v -= {{PAD_LEFT}};
+        yi[i].v -= {{PAD_TOP}};
+    })
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        {{dst}}[i].v = 0;
+    })
+
+    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+    {
+        int ck = 0;
+        int xk = i % _IWEI_WIDTH;
+        int yk = i / _IWEI_WIDTH;
+
+        int k = 0;
+        for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+        {
+            TILE({{SRC_DATA_TYPE}}, M0, K0, a);
+            TILE({{WEI_DATA_TYPE}}, N0, K0, b);
+
+            // Initialize tiles
+            LOOP_UNROLLING(int, i, 0, 1, M0,
+            {
+                a[i].v = {{ZERO_VALUE}};
+            })
+
+            LOOP_UNROLLING(int, i, 0, 1, N0,
+            {
+                b[i].v = {{ZERO_VALUE}};
+            })
+
+            // Load tile from the src tensor
+            T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+            // Load tile from the weights tensor
+            T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+            // Compute the matrix multiplication between two tiles
+            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
+
+            ck += K0;
+        }
+
+        // The left-over loop below is specialized at code-generation time using _ISRC_CHANNELS and K0
+        // This compile-time specialization should be removed in case of dynamic tensor support
+)_";
+
+    if(leftover_loop)
+    {
+        code += R"_(
+        // Left-over accumulations
+        for(; k < _ISRC_CHANNELS; ++k)
+        {
+            TILE({{SRC_DATA_TYPE}}, M0, 1, a);
+            TILE({{WEI_DATA_TYPE}}, N0, 1, b);
+
+            // Initialize tiles
+            LOOP_UNROLLING(int, i, 0, 1, M0,
+            {
+                a[i].v = {{ZERO_VALUE}};
+            })
+
+            LOOP_UNROLLING(int, i, 0, 1, N0,
+            {
+                b[i].v = {{ZERO_VALUE}};
+            })
+
+            // Load tile from the src tensor
+            T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+            // Load tile from the weights tensor
+            // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+            T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+            // Compute the matrix multiplication between two tiles
+            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
+
+            ++ck;
+        }
+    )_";
+    }
+
+    code += R"_(
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+
+    }
+)_";
+
+    if(_bias && _bias->has_valid_id())
+    {
+        code += R"_(
+    TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
+
+    T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+    )_";
+    }
+
+    code += R"_(
+}
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+    return code;
+}
+
+void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    vtable.declare_variable(
+        _src,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_src),
+        "src");
+
+    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+    vtable.declare_variable(
+        _weight,
+        GpuKernelArgumentInfo(weight_type),
+        comp_group.is_intermediate_tensor(_weight),
+        "weight");
+
+    if(_bias && _bias->has_valid_id()) // optional bias
+    {
+        vtable.declare_variable(
+            _bias,
+            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
+            comp_group.is_intermediate_tensor(_bias),
+            "bias");
+    }
+    vtable.declare_variable(
+        _dst,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_dst),
+        "dst");
+}
+
+TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    TagLUT lut{};
+    // Arguments and global shared variables
+    lut["src"]    = vtable.get_variable(_src);
+    lut["weight"] = vtable.get_variable(_weight);
+
+    if(_bias && _bias->has_valid_id()) // optional bias
+    {
+        lut["bias"]          = vtable.get_variable(_bias);
+        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
+    }
+    lut["dst"] = vtable.get_variable(_dst);
+
+    const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+    lut["arg_dst"]          = dst_argument.uniq_name;
+
+    // Local build options
+    lut["meta_kernel_id"] = id();
+    lut["ACC_DATA_TYPE"]  = _src->data_type();
+    lut["SRC_DATA_TYPE"]  = _src->data_type();
+    lut["WEI_DATA_TYPE"]  = _weight->data_type();
+
+    lut["SRC_TENSOR_TYPE"] = "BUFFER";
+    switch(vtable.get_variable(_weight).kernel_argument_info.type)
+    {
+        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+        {
+            lut["WEI_TENSOR_TYPE"] = "IMAGE";
+            break;
+        }
+        default:
+        {
+            lut["WEI_TENSOR_TYPE"] = "BUFFER";
+            break;
+        }
+    }
+    const auto width_idx  = 1;
+    const auto height_idx = 2;
+    lut["WEI_WIDTH"]  = _weight->dimension(width_idx);
+    lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
+
+    lut["STRIDE_X"] = _attributes.stride().x();
+    lut["STRIDE_Y"] = _attributes.stride().y();
+
+    lut["PAD_LEFT"] = _attributes.pad().left;
+    lut["PAD_TOP"]  = _attributes.pad().top;
+
+    lut["ZERO_VALUE"] = 0;
+
+    return lut;
+}
+
+CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
+{
+    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+    const DataType     data_type   = _src->data_type();
+
+    /// NOTE: For now tile sizes (n0, m0, k0) are set by the execution window. This may change in the future
+    const auto         root_window = comp_group.get_root_component()->template_writer()->get_window();
+    const unsigned int n0          = root_window.x().step();
+    const unsigned int m0          = root_window.y().step();
+    const unsigned int k0          = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, _src->dimension(channel_idx)); + const unsigned int partial_store_n0 = _dst->dimension(0) % n0; + + CLBuildOptions build_opts{}; + if(_settings.fast_relaxed_math()) + { + build_opts.add_option("-cl-fast-relaxed-math"); + } + else + { + // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations + // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations + build_opts.add_option("-cl-unsafe-math-optimizations"); + } + build_opts.add_option("-DIS_TILED"); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +std::string ClTemplateDirectConv2d::get_config_id() const +{ + const DataType data_type = _src->data_type(); + const DataLayout data_layout = _src->data_layout(); + + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + const unsigned int kernel_size = _weight->dimension(width_idx); + + std::string config_id{}; + config_id += lower_string(string_from_data_type(data_type)); + config_id += "_"; + config_id += support::cpp11::to_string(kernel_size); + config_id += "_"; + config_id += support::cpp11::to_string(_attributes.stride().x()); + config_id += "_"; + config_id += support::cpp11::to_string(_attributes.stride().y()); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(width_idx)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(height_idx)); + config_id += "_"; + config_id += lower_string(string_from_data_layout(data_layout)); + return config_id; +} + +std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const +{ + return std::set<std::string>{ "helpers.h", "tile_helpers.h" }; +} + +Window ClTemplateDirectConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const auto output_shape = _dst->tensor_shape(); + + const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u); + const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U; + + // Create and configure kernel window + Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + + const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows); + win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows)); + win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h new file mode 100644 index 0000000000..48027a9b8d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter +{ +public: + using Attributes = ClComponentDirectConv2d::Attributes; + using Settings = ClComponentDirectConv2d::Settings; + /** Constructor + * + * Similar to @ref ClComponentDirectConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + ClTemplateDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + /** Prevent instances of this class from being copy constructed */ + ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default; + /** Allow instances of this class to be moved */ + ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup 
&comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. + * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the build options used in the component + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return CLBuildOptions Build options + */ + CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override; + /** Generate the component config id string used for tuning */ + std::string get_config_id() const override; + /** Generate the header list used in the component */ + std::set<std::string> get_headers_list() const override; + /** Generate the execution window for the component */ + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_weight; + const ITensorInfo *_bias; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp new file mode 100644 index 0000000000..6c4b8f52f2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "ClTemplateStore.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+    : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+{
+    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+
+std::string ClTemplateStore::get_name() const
+{
+    return "store";
+}
+
+std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
+{
+    ARM_COMPUTE_UNUSED(comp_group);
+    return R"_(
+//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+{
+// This also follows NHWC layout
+// g_ind_0 maps to global_id(0), which maps to Channel
+// g_ind_1 maps to global_id(1), which maps to Height and Width (Collapsed Window)
+// g_ind_2 maps to global_id(2), which maps to N / Batch
+#define _IDST_WIDTH {{dst}}_w
+#define _IDST_HEIGHT {{dst}}_h
+    TILE(uint, M0, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+        dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+    })
+
+    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+    T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
+
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+    //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+}
+
+)_";
+}
+
+void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    vtable.declare_variable(
+        _src,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_src),
+        "src");
+    vtable.declare_variable(
+        _dst,
+        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+        comp_group.is_intermediate_tensor(_dst),
+        "dst");
+}
+
+TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+    TagLUT lut{};
+
+    // Arguments and global shared variables
+    lut["src"] = vtable.get_variable(_src);
+    lut["dst"] = vtable.get_variable(_dst);
+
+    // Local build options
+    lut["meta_kernel_id"]  = id();
+    lut["DST_TENSOR_TYPE"] = "BUFFER";
+    const auto dst_info    = comp_group.get_dst_tensors()[0];
+    lut["DST_DATA_TYPE"]   = dst_info->data_type();
+
+    return lut;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
new file mode 100644
index 0000000000..3f97a82204
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE + +#include "arm_compute/core/experimental/Types.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateStore final : public IGpuTemplateComponentWriter +{ +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + */ + ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors); + /** Prevent instances of this class from being copy constructed */ + ClTemplateStore(const ClTemplateStore &store) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateStore &operator=(const ClTemplateStore &store) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateStore(ClTemplateStore &&store) = default; + /** Allow instances of this class to be moved */ + ClTemplateStore &operator=(ClTemplateStore &&store) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. 
+ * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp new file mode 100644 index 0000000000..cb643a741d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "ClTemplateWriter.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/// @note: some tags can be unused since they could be used only for the macros, or only for the component code
+std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
+{
+    std::string replaced_code    = "";
+    bool        scanning_pattern = false;
+    std::string pattern_found    = "";
+    // Scan the template in overlapping pairs of characters; the look-ahead is bounds-checked so
+    // that the final character of the template is not silently dropped
+    for(size_t i = 0; i < code_template.size(); ++i)
+    {
+        if(!scanning_pattern)
+        {
+            if(i + 1 < code_template.size() && code_template[i] == '{' && code_template[i + 1] == '{')
+            {
+                i += 1;
+                scanning_pattern = true;
+                pattern_found    = "";
+            }
+            else
+            {
+                replaced_code += code_template[i];
+            }
+        }
+        else
+        {
+            if(i + 1 < code_template.size() && code_template[i] == '}' && code_template[i + 1] == '}')
+            {
+                i += 1;
+                scanning_pattern = false;
+                std::string err = "Pattern " + pattern_found + " not found in tags";
+                ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
+                replaced_code += tags.find(pattern_found)->second.value;
+            }
+            else
+            {
+                pattern_found += code_template[i];
+            }
+        }
+    }
+
+    return replaced_code;
+}
+ClTemplateWriter::~ClTemplateWriter()
+{
+}
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
+    : _components{ components }
+{
+}
+std::string ClTemplateWriter::get_name()
+{
+    return write_kernel_name();
+}
+std::string ClTemplateWriter::get_code()
+{
+    return write_code();
+}
+std::string ClTemplateWriter::get_config_id()
+{
+    std::string config_id = get_name();
+    for(const auto &comp : _components)
+    {
+        config_id += "--" + comp->template_writer()->get_config_id() + "--";
+    }
+
+    return config_id;
+}
+
+CLBuildOptions ClTemplateWriter::get_build_options()
+{
+    CLBuildOptions build_opts{};
+
+    for(const auto &comp : _components)
+    {
+        build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
+    }
+
+    return build_opts;
+}
+
+Window ClTemplateWriter::get_window() const
+{
+    const auto root_comp = _components.get_root_component();
+    ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+    return root_comp->template_writer()->get_window();
+}
+
+std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
+{
+    // Assemble GpuKernelArguments
+    std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
+    for(const auto t : _components.get_argument_tensors())
+    {
+        tensors.emplace(
+            t->id(),
+            GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+    }
+    return tensors;
+}
+
+std::string ClTemplateWriter::write_code()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
+
+    // These data structures will hold the data from all the components in the blueprint
+    std::set<std::string>    headers_list{};
+    std::set<std::string>    additional_macros{};
+    std::vector<std::string> component_codes{}; // vector because order matters
+
+    // Pass 1: Declare all kernel variables
+    for(auto &component : _components)
+    {
+        component->template_writer()->declare_variables(_vtable, _components);
+    }
+    // Pass 2: Generate component codes
+    for(auto &component : _components)
+    {
+        const auto component_writer       = component->template_writer();
+        auto       curr_headers_list      = component_writer->get_headers_list();
+        auto       curr_additional_macros = component_writer->get_additional_macros();
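+        // The tag lookup table queried below is resolved against the variable table populated in
+        // Pass 1, so a component may reference tensor variables declared by other components in
+        // the same group (e.g. the group's final destination tensor)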
+        auto       curr_component_code    = component_writer->get_component_code(_components);
+        const auto var_lut                = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+        component_codes.push_back(replace_tags(curr_component_code, var_lut));
+
+        headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
+        if(!curr_additional_macros.empty()) // Some components might not have any
+        {
+            additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
+        }
+    }
+
+    // Pass 3: Assemble the data gathered by traversing the graph into the string "code"
+    std::string code = "";
+
+    for(auto &header : headers_list)
+    {
+#if defined(EMBEDDED_KERNELS)
+        code += CLKernelLibrary::get().get_program(header).first;
+#else  // defined(EMBEDDED_KERNELS)
+        code += "#include \"" + header + "\"\n";
+#endif // defined(EMBEDDED_KERNELS)
+    }
+
+    for(auto &macros : additional_macros)
+    {
+        code += macros;
+    }
+
+    code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors()));
+
+    code += "\n{\n\n";
+
+    code += "    //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
+    code += write_global_section();
+    code += "    //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+
+    for(const auto &component_code : component_codes)
+    {
+        code += component_code;
+    }
+
+    code += "}\n";
+
+    return code;
+}
+std::string ClTemplateWriter::write_global_section() const
+{
+    const auto dst_tensors = _components.get_dst_tensors();
+    ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
+    const auto dst_info   = dst_tensors[0];
+    const auto dst_w      = dst_info->dimension(0);
+    const auto tile_w     = std::max(1, get_window().x().step());
+    const auto tile_h     = std::max(1, get_window().y().step());
+    auto       leftover_w = dst_w % tile_w;
+
+    std::string code = "";
+    code += std::string("    int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+    code += std::string("    int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
+    code += std::string("    int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
+
+    code += "    const bool g_cond_x = (g_ind_0 == 0);\n";
+    code += "    const bool g_cond_y = (g_ind_1 == 0);\n";
+
+    return code;
+}
+std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
+{
+    std::string code;
+    switch(var.kernel_argument_info.type)
+    {
+        case GpuKernelArgumentInfo::Type::Vector:
+        {
+            code += "\n    VECTOR_DECLARATION(" + var.uniq_name + ")";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Image:
+        {
+            code += "\n    IMAGE_DECLARATION(" + var.uniq_name + ")";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Image_3D:
+        {
+            code += "\n    IMAGE_DECLARATION(" + var.uniq_name + "),";
+            code += "\n    unsigned int " + var.uniq_name + "_stride_z";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+        {
+            code += "\n    __read_only image2d_t " + var.uniq_name + "_img,";
+            code += "\n    unsigned int " + var.uniq_name + "_stride_z";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+        {
+            code += "\n    TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
+            break;
+        }
+        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+        {
+            code += "\n    TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
+        }
+    }
+    return code;
+}
+std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
+{
+    std::string code = "\n__kernel void " + write_kernel_name() + "(";
+
+    // All arguments but the last are followed by a comma
+    for(size_t i = 0; i + 1 < argument_list.size(); ++i)
+    {
+        code += write_argument_declaration(argument_list[i]) + ",";
+    }
+    if(argument_list.size() > 0)
+    {
+        code += write_argument_declaration(argument_list[argument_list.size() - 1]);
+    }
+
+    code += ')';
+
+    return code;
+}
+std::string ClTemplateWriter::write_kernel_name() const
+{
+    if(_components.empty())
+    {
+        return "empty_kernel";
+    }
+    std::string name = _components[0]->template_writer()->get_name();
+    for(size_t i = 1; i < _components.size(); ++i)
+    {
+        name += "___";
+        name += _components[i]->template_writer()->get_name();
+    }
+
+    return name;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
new file mode 100644
index 0000000000..83f617b6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Use a templated-string-based method to write kernel code
+ * It stitches the component code templates together based on the valid fusion configuration.
+ * It then instantiates the actual kernel code from the template and the generated tag lookup table.
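+ * For instance (illustrative values only), a component template line such as
+ *   "TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});"
+ * together with a tag lookup table mapping "ACC_DATA_TYPE" to "float" and "dst" to a unique
+ * variable name such as "G0_dst" is instantiated by replace_tags() as
+ *   "TILE(float, M0, N0, G0_dst);"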
+ */ +class ClTemplateWriter : public IGpuKernelWriter +{ +public: + /** Instantiates a kernel code string from the kernel code template + * @note: some tags can be unused since they could be used only for the macros, or only for the component code + * + * @param[in] code_template Kernel code template + * @param[in] tags Tag lookup table + * + * @return std::string Instantiated kernel string + */ + static std::string replace_tags(const std::string &code_template, const TagLUT &tags); + /** Default constructor */ + ClTemplateWriter() = default; + /** Constructor + * + * @param[in] components Kernel component group from which the kernel will be generated + */ + ClTemplateWriter(const GpuKernelComponentGroup &components); + /** Destructor */ + ~ClTemplateWriter() override; + /** Generate kernel name */ + std::string get_name() override; + /** Generate kernel code */ + std::string get_code() override; + /** Generate build options */ + CLBuildOptions get_build_options() override; + /** Generate config id string of the entire kernel. This is used for tuning */ + std::string get_config_id() override; + /** Generate execution window */ + Window get_window() const override; + /** Get the kernel argument lists of the kernel*/ + std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override; + +private: + std::string write_kernel_name() const; + std::string write_code(); + std::string write_global_section() const; + std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const; + std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const; + +private: + GpuKernelComponentGroup _components{}; + GpuKernelVariableTable _vtable{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */ diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h new file mode 100644 index 0000000000..55eb4c5c77 --- /dev/null +++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h @@ -0,0 +1,658 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+
+#include "arm_compute/core/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <set>
+#include <tuple>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+    return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+} // namespace
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph
+ * Represented as doubly-linked adjacency lists, differentiating between source and destination
+ */
+class DependencyGraph
+{
+public:
+    using Id         = int32_t;
+    using TensorId   = Id;
+    using OperatorId = Id;
+    /** Adjacency list
+     *
+     */
+    using AdjList = std::map<Id, std::vector<Id>>;
+
+    /** A pack of an operator together with its input and output tensors, used when traversing the graph in topological order
+     *
+     */
+    struct OpPack
+    {
+        OperatorId            op{};
+        std::vector<TensorId> inputs{};
+        std::vector<TensorId> outputs{};
+        friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+        {
+            return std::make_tuple(
+                       opp0.op, opp0.inputs, opp0.outputs)
+                   == std::make_tuple(
+                       opp1.op, opp1.inputs, opp1.outputs);
+        }
+    };
+
+public:
+    DependencyGraph() = default;
+    friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &);
+
+    /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list
+     * @note The list is expected to only grow from head to tail
+     *
+     * PRECONDITION: The current graph is already linear
+     *
+     * @return true  If the operator can be added while keeping the graph as a linear sequence
+     * @return false Otherwise
+     */
+    bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) const
+    {
+        ARM_COMPUTE_UNUSED(op, outputs);
+        if(all_ops().empty())
+        {
+            return true;
+        }
+        // The new op must consume exactly one tensor that already exists in the graph...
+        std::vector<TensorId> common_tensors{};
+        auto                  existing_tensors = all_tensors();
+        std::sort(existing_tensors.begin(), existing_tensors.end()); // To use std::set_intersection, both input sets must be sorted
+        std::vector<TensorId> sorted_inputs{ inputs };
+        std::sort(sorted_inputs.begin(), sorted_inputs.end());
+        std::set_intersection(existing_tensors.begin(), existing_tensors.end(),
+                              sorted_inputs.begin(), sorted_inputs.end(), std::back_inserter(common_tensors));
+        if(common_tensors.size() != 1U)
+        {
+            return false;
+        }
+        // ...and that tensor must be an output of the current tail op
+        const auto linked_tensor = common_tensors[0];
+        const auto tail_ops      = get_dst_ops();
+        ARM_COMPUTE_ERROR_ON(tail_ops.size() != 1U); // PRECONDITION
+        const auto tail = tail_ops[0];
+
+        if(!is_in(linked_tensor, dst_tensors(tail)))
+        {
+            return false;
+        }
+        return true;
+    }
+    /** Add an operator, while keeping the graph as a "linear sequence"
+     *
+     * PRECONDITION: The current graph is already linear
+     * INVARIANT: The list can only grow from head to tail
+     * POSTCONDITION: The graph is still linear
+     */
+    void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+    {
+        ARM_COMPUTE_ERROR_ON(!try_add_operator_as_linear(op, inputs, outputs));
+        const auto success = add_operator(op, inputs, outputs);
+        ARM_COMPUTE_UNUSED(success); // Avoid an unused-variable warning when assertions are compiled out
+        ARM_COMPUTE_ERROR_ON(!success);
+    }
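+    /** Example usage (illustrative ids only; any int32_t values may be used):
+     *
+     * @code
+     * DependencyGraph graph{};
+     * graph.add_operator(0, { 10, 11 }, { 12 }); // op0: t10, t11 -> t12
+     * graph.add_operator(1, { 12 }, { 13 });     // op1: t12 -> t13
+     * // topological_sort() then returns the OpPacks of op0 and op1, in that order
+     * const std::vector<DependencyGraph::OpPack> sorted = graph.topological_sort();
+     * @endcode
+     */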
+    /** Add a new operator
+     * Returns false if adding the operator would violate the DAG invariant;
+     * an invalid operation will not change the graph
+     *
+     * @param[in] op      Operator to add
+     * @param[in] inputs  Input tensors to the operator
+     * @param[in] outputs Output tensors to the operator
+     *
+     * @return true  If the operator was added
+     * @return false Otherwise
+     */
+    bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+    {
+        if(operator_exists(op))
+        {
+            return false;
+        }
+        _adj_src_tensors[op] = {};
+        _adj_dst_tensors[op] = {};
+        for(auto in_tensor : inputs)
+        {
+            // Linking input tensor to operator node will never create a cycle / loop because we guarantee
+            // each op is newly created, so every <input, op> pair / edge is new
+            link_input(op, in_tensor);
+        }
+        for(auto out_tensor : outputs)
+        {
+            // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
+            if(path_exists_from_tensor_to_op(out_tensor, op))
+            {
+                remove_operator(op);
+                return false;
+            }
+            else
+            {
+                link_output(op, out_tensor);
+            }
+        }
+
+        return true;
+    }
+
+    /** Sort the graph in a topological order
+     *
+     * @return std::vector<OpPack>
+     */
+    std::vector<OpPack> topological_sort() const
+    {
+        // Incident degree (number of source operators to an op)
+        std::map<OperatorId, unsigned int> in_degree{};
+        std::set<OperatorId>               visited_ops{};
+        std::deque<OperatorId>             zero_in_degree_ops{};
+        std::vector<OpPack>                sorted_op_packs{};
+        for(auto op : all_ops())
+        {
+            const auto degree = src_ops(op).size();
+            in_degree[op]     = degree;
+            if(degree == 0)
+            {
+                zero_in_degree_ops.push_back(op);
+                visited_ops.insert(op);
+            }
+        }
+
+        while(!zero_in_degree_ops.empty())
+        {
+            const OperatorId op = zero_in_degree_ops.front();
+            zero_in_degree_ops.pop_front();
+            sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+
+            for(const auto next_op : dst_ops(op))
+            {
+                if(in_degree[next_op] > 0)
+                {
+                    in_degree[next_op]--;
+                }
+                if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
+                {
+                    zero_in_degree_ops.push_back(next_op);
+                    visited_ops.insert(next_op); // Mark next_op as visited so that it is scheduled only once
+                }
+            }
+        }
+
+        return sorted_op_packs;
+    }
+
+    void find_independent_paths_util(OperatorId op, std::vector<std::vector<OperatorId>> &paths, std::vector<OperatorId> cur_path,
+                                     const std::map<OperatorId, unsigned int> &in_degree) const
+    {
+        // We have found an unresolved dependency
+        if(in_degree.at(op) > 1)
+        {
+            paths.push_back(cur_path);
+            return;
+        }
+        const auto child_ops = dst_ops(op);
+
+        // cur_path is deliberately passed by value: each recursive branch extends its own copy
+        cur_path.push_back(op);
+        // Hit the leaf op
+        if(child_ops.empty())
+        {
+            paths.push_back(cur_path);
+            return;
+        }
+        for(const auto child_op : child_ops)
+        {
+            find_independent_paths_util(child_op, paths, cur_path, in_degree);
+        }
+    }
+    /** Find all independent linear paths from op, which don't depend on any other op
+     *
+     * @return std::vector<std::vector<OperatorId>>
+     */
+    std::vector<std::vector<OperatorId>> find_independent_paths(OperatorId op,
+                                                                const std::map<OperatorId, unsigned int> &in_degree) const
+    {
+        std::vector<std::vector<OperatorId>> paths;
+        std::vector<OperatorId>              cur_path;
+        find_independent_paths_util(op, paths, cur_path, in_degree);
+        return paths;
+    }
+    /** Find a longest linear path from op, which doesn't depend on any other op
+     *
+     * @return std::vector<OperatorId>
+     */
+    std::vector<OperatorId> find_longest_independent_path(OperatorId op,
+                                                          const std::map<OperatorId, unsigned int> &in_degree) const
+    {
+        const auto paths = find_independent_paths(op, in_degree);
+        ARM_COMPUTE_ERROR_ON(paths.empty());
+        size_t                         max_len  = 0;
+        const std::vector<OperatorId> *max_path = nullptr;
+
+        for(const auto &path : paths)
+        {
+            if(path.size() >= max_len)
+            {
+                max_path = &path;
+                max_len  = path.size();
+            }
+        }
+        return *max_path;
+    }
+    std::vector<OperatorId> propose_next_path(std::set<OperatorId>                     &candidate_ops,
+                                              const std::map<OperatorId, unsigned int> &in_degree) const
+    {
+        if(candidate_ops.empty())
+        {
+            return {};
+        }
+        size_t                  max_len = 0;
+        std::vector<OperatorId> max_path;
+        OperatorId              chosen_op{};
+        for(auto op : candidate_ops)
+        {
+            const auto path = find_longest_independent_path(op, in_degree);
+            if(path.size() >= max_len)
+            {
+                chosen_op = op;
+                max_path  = path;
+                max_len   = path.size();
+            }
+        }
+        candidate_ops.erase(chosen_op);
+        return max_path;
+    }
+    /** Partition the graph into a list of linear sub-"graphs", while preserving the topological order, and trying to minimize
+     *  the number of partitions
+     */
+    std::vector<std::vector<OpPack>> topological_partition() const
+    {
+        // Initialize the in-degree of every op and collect the initial (zero in-degree) candidate ops
+        std::map<OperatorId, unsigned int> in_degree{};
+        std::set<OperatorId>               candidate_ops{};
+        for(auto op : all_ops())
+        {
+            const auto degree = src_ops(op).size();
+            in_degree[op]     = degree;
+            if(degree == 0)
+            {
+                candidate_ops.insert(op);
+            }
+        }
+
+        std::vector<std::vector<OpPack>> sorted_partitions{};
+        while(!candidate_ops.empty())
+        {
+            // Propose the longest currently-independent linear path among the candidate ops
+            const auto path = propose_next_path(candidate_ops, in_degree);
+
+            // Append to sorted_partitions
+            std::vector<OpPack> path_op_pack{};
+            for(auto op : path)
+            {
+                path_op_pack.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+            }
+            sorted_partitions.push_back(path_op_pack);
+            // Remove the whole path from the bookkeeping (update in_degree and candidate_ops)
+            for(auto op : path)
+            {
+                for(const auto next_op_child : dst_ops(op))
+                {
+                    if(in_degree[next_op_child] > 0)
+                    {
+                        in_degree[next_op_child]--;
+                    }
+                    if(in_degree[next_op_child] == 0 && !is_in(next_op_child, path)) // We do not want to put the proposed path back into candidates
+                    {
+                        candidate_ops.insert(next_op_child);
+                    }
+                }
+            }
+        }
+        return sorted_partitions;
+    }
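+    // Example (illustrative ids only): for the graph
+    //   t0 -> [op0] -> t1 ; t1 -> [op1] -> t2 ; t1 -> [op2] -> t3
+    // topological_partition() produces two linear partitions, e.g. { op0, op1 } and { op2 };
+    // which of the two consumers of t1 extends op0's partition is an implementation detail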
+    /** Strict equality comparison (all internal ids and order of insertion matter).
+     * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+     *
+     * @param[in] g0 First graph to compare
+     * @param[in] g1 Second graph to compare
+     *
+     * @return true  If the two graphs are identical
+     * @return false Otherwise
+     */
+    friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+    {
+        // Do not compare id allocators
+        return std::make_tuple(
+                   g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops)
+               == std::make_tuple(
+                   g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+    }
+    std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
+    {
+        return _adj_src_ops.at(tensor);
+    }
+    std::vector<OperatorId> dst_ops_from_tensor(TensorId tensor) const
+    {
+        return _adj_dst_ops.at(tensor);
+    }
+    /** Get all tensors
+     *
+     * @return std::vector<TensorId>
+     */
+    std::vector<TensorId> all_tensors() const
+    {
+        std::vector<TensorId> tensors{};
+        std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
+        {
+            return it.first;
+        });
+        return tensors;
+    }
+    /** Get source tensors of the whole graph
+     *
+     * @return std::vector<TensorId>
+     */
+    std::vector<TensorId> global_src_tensors() const
+    {
+        std::vector<TensorId> tensors;
+        for(auto tensor_src_ops : _adj_src_ops)
+        {
+            if(tensor_src_ops.second.empty())
+            {
+                tensors.push_back(tensor_src_ops.first);
+            }
+        }
+        return tensors;
+    }
+    /** Get destination tensors of the whole graph
+     *
+     * @return std::vector<TensorId>
+     */
+    std::vector<TensorId> global_dst_tensors() const
+    {
+        std::vector<TensorId> tensors;
+        for(auto tensor_dst_ops : _adj_dst_ops)
+        {
+            if(tensor_dst_ops.second.empty())
+            {
+                tensors.push_back(tensor_dst_ops.first);
+            }
+        }
+        return tensors;
+    }
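+    // For a graph t0 -> [op0] -> t1 -> [op1] -> t2, global_src_tensors() returns { t0 }
+    // (tensors with no producing op) and global_dst_tensors() returns { t2 } (tensors with
+    // no consuming op)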
+    /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+     *
+     * @return std::vector<OperatorId>
+     */
+    std::vector<OperatorId> get_root_ops() const
+    {
+        std::vector<OperatorId> ops{};
+        const auto              op_list = all_ops();
+
+        for(auto op : op_list)
+        {
+            if(src_ops(op).empty())
+            {
+                ops.emplace_back(op);
+            }
+        }
+        return ops;
+    }
+
+private:
+    void link_input(OperatorId op, TensorId in_tensor)
+    {
+        ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+        if(!tensor_exists(in_tensor))
+        {
+            insert_new_tensor(in_tensor);
+        }
+        ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); // Prevent repetitive linking
+        _adj_src_tensors[op].push_back(in_tensor);
+        _adj_dst_ops[in_tensor].push_back(op);
+    }
+    void link_output(OperatorId op, TensorId out_tensor)
+    {
+        ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+        if(!tensor_exists(out_tensor))
+        {
+            insert_new_tensor(out_tensor);
+        }
+        ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); // Prevent repetitive linking
+        _adj_dst_tensors[op].push_back(out_tensor);
+        _adj_src_ops[out_tensor].push_back(op);
+    }
+
+    std::vector<OperatorId> src_ops(OperatorId op) const
+    {
+        ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+        std::vector<OperatorId> ops{};
+        for(TensorId src_tensor : src_tensors(op))
+        {
+            ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+        }
+        return ops;
+    }
+    std::vector<OperatorId> dst_ops(OperatorId op) const
+    {
+        ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+        std::vector<OperatorId> ops{};
+        for(TensorId dst_tensor : _adj_dst_tensors.at(op))
+        {
+            ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+        }
+        return ops;
+    }
+
+    /** Get source tensors to an operator
+     *
+     * @param[in] op Operator to query
+     * @return std::vector<TensorId>
+     */
+    std::vector<TensorId> src_tensors(OperatorId op) const
+    {
+        ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+        return _adj_src_tensors.at(op);
+    }
+    /** Get destination tensors to an operator
+     *
+     * @param[in] op Operator to query
+     * @return std::vector<TensorId>
+     */
+    std::vector<TensorId> dst_tensors(OperatorId op) const
+    {
+        ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+        return _adj_dst_tensors.at(op);
+    }
+    /** Get all operators
+     *
+     * @return std::vector<OperatorId>
+     */
+    std::vector<OperatorId> all_ops() const
+    {
+        std::vector<OperatorId> ops{};
+        std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
+        {
+            return it.first;
+        });
+        return ops;
+    }
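+    // Implementation note: _adj_src_tensors / _adj_dst_tensors are keyed by operator id, while
+    // _adj_src_ops / _adj_dst_ops are keyed by tensor id; link_input(), link_output() and
+    // remove_operator() below keep the four adjacency lists mutually consistent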
+    /** Remove an operator from the graph
+     *
+     * @param[in] op Operator to remove
+     */
+    void remove_operator(OperatorId op)
+    {
+        for(auto src_tensor : _adj_src_tensors.at(op))
+        {
+            auto &dst_ops = _adj_dst_ops.at(src_tensor);
+            dst_ops.erase(
+                std::remove(std::begin(dst_ops), std::end(dst_ops), op),
+                std::end(dst_ops));
+        }
+        for(auto dst_tensor : _adj_dst_tensors.at(op))
+        {
+            auto &src_ops = _adj_src_ops.at(dst_tensor);
+            src_ops.erase(
+                std::remove(std::begin(src_ops), std::end(src_ops), op),
+                std::end(src_ops));
+        }
+        // Remove any isolated tensors
+        // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
+        for(auto t : all_tensors())
+        {
+            if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+            {
+                _adj_src_ops.erase(t);
+                _adj_dst_ops.erase(t);
+            }
+        }
+        _adj_src_tensors.erase(op);
+        _adj_dst_tensors.erase(op);
+    }
+    void insert_new_tensor(TensorId tensor)
+    {
+        _adj_src_ops[tensor] = {};
+        _adj_dst_ops[tensor] = {};
+    }
+    bool tensor_exists(TensorId tensor) const
+    {
+        return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+    }
+    bool operator_exists(OperatorId op) const
+    {
+        return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+    }
+    bool is_src_tensor_of(OperatorId op, TensorId tensor) const
+    {
+        if(!operator_exists(op) || !tensor_exists(tensor))
+        {
+            return false;
+        }
+        const auto op_inputs = src_tensors(op);
+        return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+    }
+    bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
+    {
+        if(!operator_exists(op) || !tensor_exists(tensor))
+        {
+            return false;
+        }
+        const auto op_outputs = dst_tensors(op);
+        return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+    }
+    bool are_connected(OperatorId op, TensorId tensor) const
+    {
+        return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+    }
+    /** Check if op is a destination / leaf operator of the whole graph
+     *
+     * @param[in] op Operator to check
+     *
+     * @return true  If op is a leaf operator
+     * @return false Otherwise
+     */
+    bool is_dst_op(OperatorId op) const
+    {
+        return dst_ops(op).empty();
+    }
+    std::vector<OperatorId> get_dst_ops() const
+    {
+        std::vector<OperatorId> ops{};
+        const auto              op_list = all_ops();
+
+        for(auto op : op_list)
+        {
+            if(is_dst_op(op))
+            {
+                ops.emplace_back(op);
+            }
+        }
+        return ops;
+    }
+    bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
+    {
+        if(!tensor_exists(src_tensor) || !operator_exists(dst_op))
+        {
+            return false;
+        }
+        for(auto child_op : dst_ops_from_tensor(src_tensor))
+        {
+            if(path_exists_from_op_to_op(child_op, dst_op))
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
+    {
+        if(!operator_exists(src_op) || !operator_exists(dst_op))
+        {
+            return false;
+        }
+        if(src_op == dst_op)
+        {
+            return true;
+        }
+        if(is_in(src_op, get_dst_ops()))
+        {
+            return false;
+        }
+        for(auto child_tensor : dst_tensors(src_op))
+        {
+            if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+private:
+    AdjList _adj_src_tensors{};
+    AdjList _adj_dst_tensors{};
+    AdjList _adj_src_ops{};
+    AdjList _adj_dst_ops{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH */