diff options
Diffstat (limited to 'src/dynamic_fusion/sketch')
105 files changed, 14885 insertions, 0 deletions
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h new file mode 100644 index 0000000000..d030bc3d45 --- /dev/null +++ b/src/dynamic_fusion/sketch/ArgumentPack.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H + +#include "arm_compute/core/experimental/Types.h" + +#include <unordered_map> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** This is a generic class that packs the arguments of an operator. 
For now, it is only used for tensor-related types + * Examples of "tensor-related types": @ref ITensorInfo, @ref ITensor, @ref ICLTensor + * + * The argument id is the position of the argument within the pack, and is represented by @ref TensorType + * + * @tparam T Tensor-related type + */ +template <typename T> +class ArgumentPack +{ +public: + /** @ref arm_compute::TensorType encodes the position of a tensor argument within the pack */ + using Id = TensorType; + /** A single argument element within the pack + * It contains either a const pointer or a non-const pointer to the Tensor-related type T, but never at the same time + */ + struct PackElement + { + PackElement() = default; + PackElement(const PackElement &elem) = default; + PackElement &operator=(const PackElement &elem) = default; + PackElement(PackElement &&elem) = default; + PackElement &operator=(PackElement &&elem) = default; + PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr) + { + } + PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor) + { + } + + Id id{ACL_UNKNOWN}; /**< Argument id within the pack */ + T *tensor{nullptr}; /**< Non-const pointer to tensor-related object */ + const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */ + }; + +public: + /** Default constructor */ + ArgumentPack() = default; + /** Destructor */ + ~ArgumentPack() = default; + /** Allow instances of this class to be copy constructed */ + ArgumentPack<T>(const ArgumentPack<T> &other) = default; + /** Allow instances of this class to be copied */ + ArgumentPack<T> &operator=(const ArgumentPack<T> &other) = default; + /** Allow instances of this class to be move constructed */ + ArgumentPack<T>(ArgumentPack<T> &&other) = default; + /** Allow instances of this class to be moved */ + ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default; + /** Initializer list Constructor */ + ArgumentPack(const std::initializer_list<PackElement> &l) : _pack{} + { + 
for (const auto &e : l) + { + _pack[e.id] = e; + } + } + /** Add tensor to the pack + * + * @param[in] id ID of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_tensor(Id id, T *tensor) + { + _pack[id] = PackElement(id, tensor); + } + /** Add const tensor to the pack + * + * @param[in] id ID of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_const_tensor(Id id, const T *tensor) + { + _pack[id] = PackElement(id, tensor); + } + /** Get tensor of a given id from the pack + * + * @param[in] id ID of tensor to extract + * + * @return The pointer to the tensor if it exists and is non-const, else nullptr + */ + T *get_tensor(Id id) + { + auto it = _pack.find(id); + return it != _pack.end() ? it->second.tensor : nullptr; + } + /** Get constant tensor of a given id + * + * @param[in] id ID of tensor to extract + * + * @return The pointer to the tensor (const or not) if it exists, else nullptr + */ + const T *get_const_tensor(Id id) const + { + auto it = _pack.find(id); + if (it != _pack.end()) + { + return it->second.ctensor != nullptr ? 
it->second.ctensor : it->second.tensor; + } + return nullptr; + } + /** Remove the tensor stored with the given id + * + * @param[in] id ID of tensor to remove + */ + void remove_tensor(Id id) + { + _pack.erase(id); + } + /** Pack size accessor + * + * @return Number of tensors registered to the pack + */ + size_t size() const + { + return _pack.size(); + } + /** Checks if pack is empty + * + * @return True if empty else false + */ + bool empty() const + { + return _pack.empty(); + } + /** Get the ACL_SRC_* tensors + * + * @return std::vector<T *> + */ + std::vector<T *> get_src_tensors() + { + std::vector<T *> src_tensors{}; + for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_tensor(static_cast<TensorType>(id)); + if (tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the const ACL_SRC_* tensors + * + * @return std::vector<const T *> + */ + std::vector<const T *> get_const_src_tensors() const + { + std::vector<const T *> src_tensors{}; + for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_const_tensor(static_cast<TensorType>(id)); + if (tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the ACL_DST_* tensors + * + * @return std::vector<T *> + */ + std::vector<T *> get_dst_tensors() + { + std::vector<T *> dst_tensors{}; + for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_tensor(static_cast<TensorType>(id)); + if (tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + /** Get the const ACL_DST_* tensors + * + * @return std::vector<const T *> + */ + std::vector<const T *> get_const_dst_tensors() const + { + std::vector<const T *> dst_tensors{}; + for (int id = 
static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_const_tensor(static_cast<TensorType>(id)); + if (tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + +private: + std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */ +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H diff --git a/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp b/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp new file mode 100644 index 0000000000..4ad94268f4 --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +CastAttributes &CastAttributes::data_type(const DataType &data_type) +{ + _data_type = data_type; + return *this; +} + +DataType CastAttributes::data_type() const +{ + return _data_type; +} + +CastAttributes &CastAttributes::convert_policy(const ConvertPolicy &convert_policy) +{ + _convert_policy = convert_policy; + return *this; +} + +ConvertPolicy CastAttributes::convert_policy() const +{ + return _convert_policy; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp new file mode 100644 index 0000000000..b177f760df --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +ClampAttributes &ClampAttributes::min_val(const float &min_val) +{ + _min_val = min_val; + return *this; +} + +float ClampAttributes::min_val() const +{ + return _min_val; +} + +ClampAttributes &ClampAttributes::max_val(const float &max_val) +{ + _max_val = max_val; + return *this; +} + +float ClampAttributes::max_val() const +{ + return _max_val; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp new file mode 100644 index 0000000000..97e74f742d --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Conv2dAttributes &Conv2dAttributes::pad(const Padding2D &pad) +{ + _pad = pad; + return *this; +} +Padding2D Conv2dAttributes::pad() const +{ + return _pad; +} +Conv2dAttributes &Conv2dAttributes::stride(const Size2D &stride) +{ + _stride = stride; + return *this; +} +Size2D Conv2dAttributes::stride() const +{ + return _stride; +} +Conv2dAttributes &Conv2dAttributes::dilation(const Size2D &dilation) +{ + _dilation = dilation; + return *this; +} +Size2D Conv2dAttributes::dilation() const +{ + return _dilation; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp new file mode 100644 index 0000000000..6f3816568c --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::pad(const Padding2D &pad) +{ + _pad = pad; + return *this; +} +Padding2D DepthwiseConv2dAttributes::pad() const +{ + return _pad; +} +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::stride(const Size2D &stride) +{ + _stride = stride; + return *this; +} +Size2D DepthwiseConv2dAttributes::stride() const +{ + return _stride; +} +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dilation(const Size2D &dilation) +{ + _dilation = dilation; + return *this; +} +Size2D DepthwiseConv2dAttributes::dilation() const +{ + return _dilation; +} + +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::depth_multiplier(const uint32_t &depth_multiplier) +{ + _depth_multiplier = depth_multiplier; + return *this; +} + +uint32_t DepthwiseConv2dAttributes::depth_multiplier() const +{ + return _depth_multiplier; +} + +DepthwiseConv2dAttributes & +DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) +{ + _dimension_rounding_type = dimension_rounding_type; + return *this; +} + +DimensionRoundingType DepthwiseConv2dAttributes::dimension_rounding_type() const +{ + return _dimension_rounding_type; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp new file mode 100644 index 0000000000..027b550377 --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +MatMulAttributes MatMulAttributes::adj_lhs(bool adj_lhs) +{ + _adj_lhs = adj_lhs; + return *this; +} +MatMulAttributes MatMulAttributes::adj_rhs(bool adj_rhs) +{ + _adj_rhs = adj_rhs; + return *this; +} +bool MatMulAttributes::adj_lhs() const +{ + return _adj_lhs; +} +bool MatMulAttributes::adj_rhs() const +{ + return _adj_rhs; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp new file mode 100644 index 0000000000..80f65f926a --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + +#include "arm_compute/core/Size2D.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +PoolingType Pool2dAttributes::pool_type() const +{ + return _pool_type; +} + +Pool2dAttributes Pool2dAttributes::pool_type(PoolingType pool_type) +{ + _pool_type = pool_type; + return *this; +} + +Padding2D Pool2dAttributes::pad() const +{ + return _pad; +} + +Pool2dAttributes Pool2dAttributes::pad(const Padding2D &pad) +{ + _pad = pad; + return *this; +} + +Size2D Pool2dAttributes::pool_size() const +{ + return _pool_size; +} + +Pool2dAttributes Pool2dAttributes::pool_size(const Size2D &pool_size) +{ + _pool_size = pool_size; + return *this; +} + +Size2D Pool2dAttributes::stride() const +{ + return _stride; +} + +Pool2dAttributes Pool2dAttributes::stride(const Size2D &stride) +{ + _stride = stride; + return *this; +} + +bool Pool2dAttributes::exclude_padding() const +{ + return _exclude_padding; +} + +Pool2dAttributes Pool2dAttributes::exclude_padding(bool exclude_padding) +{ + _exclude_padding = exclude_padding; + return *this; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp new file mode 100644 index 0000000000..0938c0df84 --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ReshapeAttributes &ReshapeAttributes::shape(const TensorShape &shape) +{ + _shape = shape; + return *this; +} +TensorShape ReshapeAttributes::shape() const +{ + return _shape; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp new file mode 100644 index 0000000000..1919dbc72d --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ResizeAttributes &ResizeAttributes::output_width(int32_t output_width) +{ + _output_width = output_width; + return *this; +} + +int32_t ResizeAttributes::output_width() const +{ + return _output_width; +} + +ResizeAttributes &ResizeAttributes::output_height(int32_t output_height) +{ + _output_height = output_height; + return *this; +} + +int32_t ResizeAttributes::output_height() const +{ + return _output_height; +} + +ResizeAttributes &ResizeAttributes::interpolation_policy(InterpolationPolicy interpolation_policy) +{ + _interpolation_policy = interpolation_policy; + return *this; +} + +InterpolationPolicy ResizeAttributes::interpolation_policy() const +{ + return _interpolation_policy; +} + +ResizeAttributes &ResizeAttributes::sampling_policy(SamplingPolicy sampling_policy) +{ + _sampling_policy = sampling_policy; + return *this; +} + +SamplingPolicy ResizeAttributes::sampling_policy() const +{ + return _sampling_policy; +} + +ResizeAttributes &ResizeAttributes::align_corners(bool align_corners) +{ + _align_corners = align_corners; + return *this; +} + +bool ResizeAttributes::align_corners() const +{ + return _align_corners; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp b/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp new file mode 100644 index 0000000000..5d4d666263 --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +SoftmaxAttributes &SoftmaxAttributes::beta(float beta) +{ + _beta = beta; + return *this; +} + +float SoftmaxAttributes::beta() const +{ + return _beta; +} + +SoftmaxAttributes &SoftmaxAttributes::is_log_softmax(bool is_log_softmax) +{ + _is_log_softmax = is_log_softmax; + return *this; +} + +bool SoftmaxAttributes::is_log_softmax() const +{ + return _is_log_softmax; +} + +SoftmaxAttributes &SoftmaxAttributes::axis(int axis) +{ + _axis = axis; + return *this; +} + +int SoftmaxAttributes::axis() const +{ + return _axis; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h new file mode 100644 index 0000000000..93881508bb --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES + +#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Services that are used throughout the creation phase of workload code + */ +class GpuComponentServices +{ +public: + /** Default constructor */ + GpuComponentServices() = default; + /** Get reference to component factory */ + GpuKernelComponentFactory &component_factory() + { + return _comp_factory; + } + +private: + GpuKernelComponentFactory _comp_factory{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h new file mode 100644 index 0000000000..c923bf9c16 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H

#include "arm_compute/core/TensorInfo.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Describe how the tensor runtime memory can be accessed
 *
 * Please see documentation under @ref GpuKernelArgumentBinding
 */
enum class TensorStorageType
{
    Unknown,            /**< Storage type not yet determined */
    ClBufferUint8Ptr,   /**< Global uint8 pointer to a CL buffer */
    ClImage2dReadOnly,  /**< Read-only CL 2d image */
    ClImage2dWriteOnly, /**< Write-only CL 2d image */
};

/** Describe additional runtime information about the tensor
 *
 * Please see documentation under @ref GpuKernelArgumentBinding
 */
enum class TensorComponentType
{
    Unknown,            /**< Component not yet determined */
    OffsetFirstElement, /**< Offset of the first element from the start of the buffer */
    Stride0,            /**< Stride (in bytes) along dimension 0 */
    Stride1,            /**< Stride (in bytes) along dimension 1 */
    Stride2,            /**< Stride (in bytes) along dimension 2 */
    Stride3,            /**< Stride (in bytes) along dimension 3 */
    Stride4,            /**< Stride (in bytes) along dimension 4 */
    Dim0,               /**< Extent of dimension 0 */
    Dim1,               /**< Extent of dimension 1 */
    Dim2,               /**< Extent of dimension 2 */
    Dim3,               /**< Extent of dimension 3 */
    Dim4,               /**< Extent of dimension 4 */
    Dim1xDim2,          /**< Product of dimension 1 and dimension 2 extents */
    Dim2xDim3,          /**< Product of dimension 2 and dimension 3 extents */
    Dim1xDim2xDim3,     /**< Product of dimension 1, 2 and 3 extents */
};

/** Describe how to extract information from a runtime Gpu tensor, and set it as an argument to a gpu kernel at runtime
 *
 * A kernel argument is just an argument to the gpu kernel as shown in the argument list below. This contrasts with a "workload argument" which is a tensor (@ref GpuWorkloadArgument)
 *      void kernel(arg0, arg1, ... argN)
 *
 * In a kernel generated using dynamic fusion (@ref GpuKernelSourceCode), every kernel argument describes part of a tensor.
 * A tensor is described as: **storages** followed by **components**
 *
 * A storage (@ref TensorStorageType) describes how the tensor runtime memory can be accessed (e.g. via a global uint8 pointer to a CL buffer)
 * A component (@ref TensorComponentType) describes additional runtime information about the tensor (e.g. the dimensions of the tensor)
 *
 * The arguments are arranged in the order of use in the generated kernel code:
 *
 *      arg0   , arg1      , arg2      , ...,           , argN
 *      storage, component0, component1, ..., componentX, storage, component0, component1, ..., componentY
 *      |                  tensor0                     | |                  tensor1                     |
 *
 * An example argument list:
 *
 *      void kernel(
 *          image2d_t t0_image,                                   // TensorStorageType::ClImage2dReadOnly
 *          uint8_t*  t0_ptr,                                     // TensorStorageType::ClBufferUint8Ptr
 *          uint      t0_dim0,                                    // TensorComponentType::Dim0
 *          uint      t0_stride1,                                 // TensorComponentType::Stride1
 *          image2d_t t1_ptr,                                     // TensorStorageType::ClImage2dReadOnly
 *          uint      t1_dim1xdim2,                               // TensorComponentType::Dim1xDim2
 *          uint      t1_stride1,                                 // TensorComponentType::Stride1
 *          uint      t1_stride2,                                 // TensorComponentType::Stride2
 *      )
 *
 */
class GpuKernelArgumentBinding
{
public:
    /** Discriminator for the union below: a binding is either a storage or a component, never both */
    enum class Type : int32_t
    {
        TensorStorage,  /**< @ref TensorStorageType */
        TensorComponent /**< @ref TensorComponentType */
    };
    /** Construct a storage binding for tensor @p id */
    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
        : _type{Type::TensorStorage}, _id{id}, _value{}
    {
        _value.tensor_storage_type = storage;
    }
    /** Construct a component binding for tensor @p id */
    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
        : _type{Type::TensorComponent}, _id{id}, _value{}
    {
        _value.tensor_component_type = component;
    }
    /** Storage type of the tensor
     *
     * Only valid when type() == Type::TensorStorage (checked by ARM_COMPUTE_ERROR_ON)
     */
    TensorStorageType tensor_storage_type() const
    {
        ARM_COMPUTE_ERROR_ON(_type != Type::TensorStorage);
        return _value.tensor_storage_type;
    }
    /** Component of the tensor
     *
     * Only valid when type() == Type::TensorComponent (checked by ARM_COMPUTE_ERROR_ON)
     */
    TensorComponentType tensor_component_type() const
    {
        ARM_COMPUTE_ERROR_ON(_type != Type::TensorComponent);
        return _value.tensor_component_type;
    }
    /** Id of the tensor this kernel argument belongs to
     */
    ITensorInfo::Id id() const
    {
        return _id;
    }
    /** Type of the kernel argument
     */
    Type type() const
    {
        return _type;
    }

private:
    Type            _type; // Selects which union member of _value is active
    ITensorInfo::Id _id;   // Tensor the argument is extracted from
    union Value
    {
        TensorStorageType   tensor_storage_type;
        TensorComponentType tensor_component_type;
    };
    Value _value;
};

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
+ */ +#include "GpuKernelComponentGraph.h" + +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +std::vector<DependencyGraph::TensorId> +GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors) +{ + std::vector<DependencyGraph::TensorId> tensor_ids{}; + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); + return tensor_ids; +} + +GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services) + : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{} +{ +} + +GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const +{ + GpuKernelComponentStream stream{_context, _services, mem_map}; + const auto op_seq = _dependency_graph.build_operators_sequence(); + + stream.new_component_group(); + for (auto op : op_seq) + { + const auto component = _components.at(op.op).get(); + const auto success = stream.add_component(component); + if (!success) // Assume first failure was because the root component is unfusable + { + stream.new_component_group(); + const auto success = stream.add_component(component); + ARM_COMPUTE_ERROR_ON(!success); + ARM_COMPUTE_UNUSED(success); + } + } + + return stream; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h new file mode 100644 index 0000000000..6f871a3c90 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH + +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h" +#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h" + +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class IGpuKernelComponent; + +/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components + * Its main purposes are: + * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators) + * - Automatically assign memory descriptions @ref MemoryDescriptor of all tensors based on graph topology + */ +class GpuKernelComponentGraph +{ +public: + /** Constructor + * + * @param[in] context @ref GpuWorkloadContext to be used by the graph + * @param[in] services @ref GpuComponentServices to be used by the graph + */ + GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services); + /** Prevent instances of this class from being copy constructed */ + GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete; + /** Prevent instances of this class from being copied */ + GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete; + /** Allow instances of this class to be move constructed */ + GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default; + /** Allow instances of this class to be moved */ + GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default; + /** Create a new component and add it to the component graph + * Component id is automatically allocated + * + * @tparam T Component type + * @tparam Args Component argument types + * + * @param[in] args Component arguments except for component id, which is auto-allocated 
+ */ + template <typename T, typename... Args> + void add_new_component(Args &&...args) + { + auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...); + ArgumentPack<ITensorInfo> tensors = comp->tensors(); + const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors()); + const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors()); + bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); + ARM_COMPUTE_UNUSED(success); + ARM_COMPUTE_ERROR_ON(!success); + _components[comp->id()] = std::move(comp); + for (auto t : tensors.get_const_src_tensors()) + { + _tensors[t->id()] = t; + } + for (auto t : tensors.get_const_dst_tensors()) + { + _tensors[t->id()] = t; + } + } + /** Perform component fusion and serialize the graph into a stream of component groups + * + * @param[in] mem_map MemoryDescriptorMap for all the tensors in the component graph + * + * @return GpuKernelComponentStream + */ + GpuKernelComponentStream fuse(const MemoryDescriptorMap &mem_map) const; + +private: + static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors); + GpuWorkloadContext *_context; + GpuComponentServices *_services; + std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components; + std::map<ITensorInfo::Id, const ITensorInfo *> _tensors; + DependencyGraph _dependency_graph{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp new file mode 100644 index 0000000000..5a6d125d96 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "GpuKernelComponentGroup.h" + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +bool GpuKernelComponentGroup::add_component(ComponentPtr component) +{ + ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered."); + + // note: Constraint 1 is guaranteed as a precondition + // Constraint 2 + if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) + { + return false; + } + // Constraint 3.1: Pattern: (Unfusable + Output) + if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && + component->type() != GpuComponentType::Output) + { + return false; + } + // Constraint 3.2 + if (!_components.empty() && + (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) + { + return false; + } + // Constraint 4 + if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) + { + return false; + } + // Constraint 5 + if (!_components.empty() && !(get_root_component()->properties() == component->properties())) + { + return false; + } + // Constraint 7 + if (!_components.empty()) + { + const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); + ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor = root_dst_tensors[0]; + const auto dst_tensors = component->tensors().get_const_dst_tensors(); + for (const auto &t : root_dst_tensors) + { + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + { + return false; + } + } + for (const auto &t : dst_tensors) + { + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) 
+ { + return false; + } + } + } + // Constraint 8 + if (!_components.empty()) + { + const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); + ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); + const auto dst_tensors = component->tensors().get_const_dst_tensors(); + for (const auto &t : root_dst_tensors) + { + if (t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + for (const auto &t : dst_tensors) + { + if (t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + } + // Constraint 9 + if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) + { + return false; + } + // Constraint 9 corollary + if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) + { + return false; + } + _components.push_back(component); + return true; +} + +void GpuKernelComponentGroup::finalize() +{ + if (_finalized) + { + return; + } + + _finalized = true; + + std::set<const ITensorInfo *> output_tensors; + std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map; + std::map<const ITensorInfo *, int32_t> tile_usages; + + for (auto component : _components) + { + const auto tensors = component->tensors(); + const auto src_tensors = tensors.get_const_src_tensors(); + const auto dst_tensors = tensors.get_const_dst_tensors(); + + // Detect input, output and intermediate tensors. + for (auto tensor : src_tensors) + { + const auto output_tensors_it = output_tensors.find(tensor); + + if (output_tensors_it != output_tensors.end()) + { + // This tensor is the output of another operator. + // It must be marked as intermediate tensor. 
+ output_tensors.erase(output_tensors_it); + _interm_tensors.insert(tensor); + } + else if (_interm_tensors.find(tensor) == _interm_tensors.end()) + { + _input_tensors.insert(tensor); + + tile_usages[tensor] = 0; + possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>()); + } + } + + for (auto tensor : dst_tensors) + { + ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end()); + ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end()); + ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end()); + output_tensors.insert(tensor); + + tile_usages[tensor] = 0; + possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>()); + } + + // Check if the output can overwrite the input tile. + const auto component_type = component->type(); + if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output) + { + ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1); + + const auto dst_tensor = dst_tensors[0]; + const auto &dst_shape = dst_tensor->tensor_shape(); + const auto &dst_type = dst_tensor->data_type(); + + tile_usages[dst_tensor] = 0; + + for (auto src_tensor : src_tensors) + { + const auto &src_shape = src_tensor->tensor_shape(); + const auto &src_type = src_tensor->data_type(); + + if (src_shape == dst_shape && src_type == dst_type) + { + const auto tile_usages_it = tile_usages.find(src_tensor); + ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end()); + + if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0) + { + // Increase the number of tile usages unless this component is an output + // and the tile has not been shared with any component. + // (Reason: output component doesn't change the content of the tile) + ++tile_usages_it->second; + } + + possible_tile_map[dst_tensor].push_back(src_tensor); + } + } + } + else + { + // Outputs of complex and unfusable components need dedicated tile. 
+ for (auto tensor : dst_tensors) + { + tile_usages[tensor] = 0; + } + } + } + + // Find the smallest list of tiles that the intermediate tensors need to write to. + for (auto tensor : _input_tensors) + { + _tile_map[tensor] = tensor; + } + + for (auto component : _components) + { + const auto dst_tensors = component->tensors().get_const_dst_tensors(); + + for (auto tensor : dst_tensors) + { + const auto target_tiles = possible_tile_map.at(tensor); + _tile_map[tensor] = tensor; + + for (auto target : target_tiles) + { + const auto num_usage = tile_usages[target]; + + if (num_usage <= 1) + { + // The target tile is consumed by only this operator, so we can reuse it + // for the destination tensor data. + _tile_map[tensor] = _tile_map.at(target); + break; + } + } + } + } + + for (auto tensor : output_tensors) + { + _tile_map[tensor] = tensor; + } + + // All intermediate tensors that cannot be shared with any previous tensor + // will need to be declared as tile variable. + for (auto tensor_tile : _tile_map) + { + if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end()) + { + _tiles.push_back(tensor_tile.first); + } + } + + std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(), + std::back_inserter(_argument_tensors)); + _any_output_tensor = *output_tensors.begin(); +} + +std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const +{ + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _tiles; +} + +const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const +{ + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + + if (_tile_map.find(tensor) != _tile_map.end()) + { + return _tile_map.at(tensor); + } + + return tensor; +} + +const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const +{ + 
ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _any_output_tensor; +} + +std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const +{ + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _argument_tensors; +} + +GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const +{ + if (empty()) + { + return nullptr; + } + return _components[0]; +} + +bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const +{ + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _interm_tensors.find(tensor) != _interm_tensors.end(); +} + +bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const +{ + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _input_tensors.find(tensor) != _input_tensors.end(); +} + +size_t GpuKernelComponentGroup::size() const +{ + return _components.size(); +} +bool GpuKernelComponentGroup::empty() const +{ + return _components.empty(); +} +GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) +{ + return _components[index]; +} +const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const +{ + return _components[index]; +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin() +{ + return _components.begin(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end() +{ + return _components.end(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const +{ + return _components.cbegin(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const +{ + return _components.cend(); +} 
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const +{ + return _components.cbegin(); +} +typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const +{ + return _components.cend(); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h new file mode 100644 index 0000000000..6ad71abb39 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP + +#include "components/Types.h" +#include <cstdint> +#include <cstdlib> +#include <map> +#include <set> +#include <vector> + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +class IGpuKernelComponent; +/** A group of gpu kernel components to be fused together + * PRECONDITIONS: + * 1. Fusion is limited to a linear sequence of kernel components + * INVARIANTS: + * @note These preconditions and invariants are exactly the same as fusion constraints for kernel components + * 2. Max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components ( + * excluding any output or input (if any) components. + * The max number of output components are bound by the maximum number of dst tensors allowed for a component / component group + * ) + * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output? + * 4. All components but unfusable, have exactly 1 dst tensor + * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. ) + * 6. All fused components share the same tunable parameters like tile size + * 7. All fused components share the same dst tensor shape + * 8. All fused components' tensors share the same @ref DataLayout + * 9. 
Maximum number of dst tensors allowed for an component (including unfusable) / component group is @ref GpuKernelComponentGroup::max_dst_tensors + * This has an impact on the total number of components supported, which = max_fused_components + max_dst_tensors + */ +class GpuKernelComponentGroup +{ +public: + using ComponentPtr = IGpuKernelComponent *; + /** Maximum number of components that can be fused into the same component group + */ + static constexpr size_t max_fused_components = 64; + /** Maximum number of dst tensors allowed for a component / component + */ + static constexpr size_t max_dst_tensors = 8; + +public: + /** Default constructor */ + GpuKernelComponentGroup() = default; + /** Allow instances of this class to be copy constructed */ + GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default; + /** Allow instances of this class to be copied */ + GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default; + /** Allow instances of this class to be move constructed */ + GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default; + /** Allow instances of this class to be moved */ + GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default; + /** Add a component pointer into the group + * If the operation fails, then no change is made to the group + * + * @param[in] component Pointer to the component to be added + * + * @return true If the operation is successful + * @return false If the operation fails + */ + bool add_component(ComponentPtr component); + /** Optimize and pre-compute information about the component group */ + void finalize(); + /** Get one of the destination tensors of this group */ + const ITensorInfo *get_any_dst_tensor() const; + /** Get tensor argument of this group + * A tensor is an argument if it is a source or destination tensor to the group + */ + std::vector<const ITensorInfo *> get_argument_tensors() const; + /** Get the root (first) component of this group */ + ComponentPtr 
get_root_component() const; + /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group + * + * An intermediate tensor is any tensor that is not an argument. + * + * @param[in] tensor @ref ITensorInfo to be looked up + * + * @return true If @p tensor is an intermediate tensor + * @return false Otherwise + */ + bool is_intermediate_tensor(const ITensorInfo *tensor) const; + /** Check if an @ref ITensorInfo is an input tensor of the group. + * + * @param[in] tensor @ref ITensorInfo to be looked up. + * + * @return true if @p tensor is an input tensor of the group, otherwise false. + */ + bool is_input_tensor(const ITensorInfo *tensor) const; + /** Get the list of temporary tiles that need to be declared */ + std::vector<const ITensorInfo *> get_tiles() const; + /** Get the shared tile that can be used to store temporary data of the specified tensor. + * + * @param[in] tensor @ref ITensorInfo to be looked up. + * + * @return @ref ITensorInfo that is used to store temporary data of @p tensor. 
+ **/ + const ITensorInfo *get_tile_for_tensor(const ITensorInfo *tensor) const; + /** Get the number of components within the group */ + size_t size() const; + /** Check if the component group is empty */ + bool empty() const; + ComponentPtr &operator[](size_t index); + const ComponentPtr &operator[](size_t index) const; + typename std::vector<ComponentPtr>::iterator begin(); + typename std::vector<ComponentPtr>::iterator end(); + typename std::vector<ComponentPtr>::const_iterator begin() const; + typename std::vector<ComponentPtr>::const_iterator end() const; + typename std::vector<ComponentPtr>::const_iterator cbegin() const; + typename std::vector<ComponentPtr>::const_iterator cend() const; + +private: + std::vector<ComponentPtr> _components{}; + + bool _finalized{false}; + + std::vector<const ITensorInfo *> _argument_tensors{}; + std::set<const ITensorInfo *> _input_tensors{}; + std::set<const ITensorInfo *> _interm_tensors{}; + const ITensorInfo *_any_output_tensor{nullptr}; + std::vector<const ITensorInfo *> _tiles{}; + std::map<const ITensorInfo *, const ITensorInfo *> _tile_map{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp new file mode 100644 index 0000000000..8042e3dd08 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "GpuKernelComponentStream.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, + GpuComponentServices *services, + const MemoryDescriptorMap &mem_map) + : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map} +{ +} + +GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() +{ + GpuWorkloadSourceCode source_code; + // Traverse through component groups and assemble workload together + for (auto &&group : _component_groups) + { + group.finalize(); + + // Write kernel code + GpuLogicalKernel logical_kernel(_services, group); + const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code(); + // The whole unit workload stage is determined by the root component + const auto unit_workload_stage = group.get_root_component()->properties().stage(); + source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map, _context); + } + return source_code; +} + +void GpuKernelComponentStream::new_component_group() +{ + _component_groups.emplace_back(); +} + +bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component) +{ + ARM_COMPUTE_ERROR_ON(_component_groups.empty()); + return _component_groups.back().add_component(component); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h new file mode 100644 index 0000000000..ef8a8a15b0 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM

#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
class GpuComponentServices;
class IGpuKernelComponent;

/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph
 * Each component group in the stream denotes a complete kernel that may consist of multiple components
 *
 * The main purposes of this class are:
 * - Facilitate component fusion algorithm by allowing insertions of new component groups into the stream
 * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode
 */
class GpuKernelComponentStream
{
public:
    /** Constructor
     *
     * @param[in] context  @ref GpuWorkloadContext to be used throughout the stream
     * @param[in] services @ref GpuComponentServices to be used throughout the stream
     * @param[in] mem_map  @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
     */
    GpuKernelComponentStream(GpuWorkloadContext *context,
                             GpuComponentServices *services,
                             const MemoryDescriptorMap &mem_map);
    /** Allow instances of this class to be copy constructed */
    GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
    /** Allow instances of this class to be copied */
    GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default;
    /** Allow instances of this class to be move constructed */
    GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default;
    /** Allow instances of this class to be moved */
    GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default;
    /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */
    GpuWorkloadSourceCode write_workload_code();
    /** Insert a new component group in the stream.
     * Subsequent components are added to this group until end of stream or the next new_component_group is called
     */
    void new_component_group();
    /** Add a component to the previously created component group
     * Throw an error if no component group is present in the stream
     *
     * @param[in] component Component to be inserted
     *
     * @return true  If the operation is successful
     * @return false Otherwise
     */
    bool add_component(IGpuKernelComponent *component);

private:
    GpuWorkloadContext   *_context;  /**< Workload context shared across the whole stream (not owned) */
    GpuComponentServices *_services; /**< Component services shared across the whole stream (not owned) */
    /** Linear sequence of component groups; the back of the vector is the currently "open" group */
    std::vector<GpuKernelComponentGroup> _component_groups{};
    MemoryDescriptorMap _mem_map{}; /**< Memory descriptors consulted when assembling the workload source code */
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
new file mode 100644
index 0000000000..11d916eec9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -0,0 +1,127 @@
/*
 * Copyright (c) 2022-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Window.h"

#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"

#include <deque>
#include <string>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** The argument list of a @ref GpuKernelSourceCode */
using GpuKernelArgumentList = std::deque<GpuKernelArgumentBinding>;

/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
 *
 * Setters follow the fluent/builder pattern: each returns *this so calls can be chained.
 * Getters return copies for the string fields and const references for the heavier ones.
 */
class GpuKernelSourceCode
{
public:
    /** Set kernel name */
    GpuKernelSourceCode &name(const std::string &n)
    {
        _name = n;
        return *this;
    }
    /** Set kernel code */
    GpuKernelSourceCode &code(const std::string &c)
    {
        _code = c;
        return *this;
    }
    /** Set kernel config id string */
    GpuKernelSourceCode &config_id(const std::string &c_id)
    {
        _config_id = c_id;
        return *this;
    }
    /** Set kernel build options */
    GpuKernelSourceCode &build_options(const CLBuildOptions &b_options)
    {
        _build_options = b_options;
        return *this;
    }
    /** Set kernel execution window */
    GpuKernelSourceCode &window(const Window &window)
    {
        _window = window;
        return *this;
    }
    /** Set kernel argument list */
    GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments)
    {
        _arguments = arguments;
        return *this;
    }
    /** Get kernel name */
    std::string name() const
    {
        return _name;
    }
    /** Get kernel code */
    std::string code() const
    {
        return _code;
    }
    /** Get kernel config id string */
    std::string config_id() const
    {
        return _config_id;
    }
    /** Get kernel build options */
    const CLBuildOptions &build_options() const
    {
        return _build_options;
    }
    /** Get kernel execution window */
    const Window &window() const
    {
        return _window;
    }
    /** Get kernel argument list */
    const GpuKernelArgumentList &arguments() const
    {
        return _arguments;
    }

private:
    std::string           _name{};          /**< Kernel name */
    std::string           _code{};          /**< Full kernel source text */
    std::string           _config_id{};     /**< Configuration/tuning identifier string */
    CLBuildOptions        _build_options{}; /**< Build options passed to the compiler */
    Window                _window{};        /**< Execution window of the kernel */
    GpuKernelArgumentList _arguments{};     /**< Ordered kernel argument bindings */
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..725a46e91c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,62 @@
/*
 * Copyright (c) 2022-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "GpuLogicalKernel.h" + +#include "arm_compute/core/experimental/Types.h" + +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components) // NOLINT + : _comp_group{std::move(components)}, _store_components{} +{ + ARM_COMPUTE_UNUSED(services); +} + +GpuKernelSourceCode GpuLogicalKernel::write_kernel_code() +{ + GpuKernelSourceCode code; + GpuCkwDriver writer{_comp_group}; + + code.name(writer.get_name()); + code.code(writer.get_code()); + code.arguments(writer.get_kernel_arguments()); + code.build_options(writer.get_build_options()); + code.config_id(writer.get_config_id()); + code.window(writer.get_window()); + + return code; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h new file mode 100644 index 0000000000..e2bc83b286 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022, 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H

#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"

#include <memory>
#include <vector>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Forward declaration */
class GpuComponentServices;
class IGpuKernelComponent;

/** A wrapper-processor of a @ref GpuKernelComponentGroup
 * It adds the load (if any) and store components to the component group
 * The @ref GpuLogicalKernel represents a complete kernel, and can proceed to invoke any kernel writer to generate the full kernel code
 */
class GpuLogicalKernel
{
public:
    /** Constructor
     *
     * @param[in] services   @ref GpuComponentServices to be used
     * @param[in] components Component group from which this logical kernel is initialized
     */
    explicit GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components); // NOLINT
    /** Allow instances of this class to be copy constructed */
    GpuLogicalKernel(const GpuLogicalKernel &) = default;
    /** Allow instances of this class to be copied */
    GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default;
    /** Allow instances of this class to be move constructed */
    GpuLogicalKernel(GpuLogicalKernel &&) = default;
    /** Allow instances of this class to be moved */
    GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default;
    /** Generate a @ref GpuKernelSourceCode */
    GpuKernelSourceCode write_kernel_code();

private:
    GpuKernelComponentGroup _comp_group{}; /**< Component group this logical kernel wraps (owned copy) */
    /** Store components appended by this logical kernel (owned) */
    std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{};
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp new file mode 100644 index 0000000000..aec8b9db4f --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h" + +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors) +{ + std::vector<DependencyGraph::TensorId> tensor_ids{}; + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); + return tensor_ids; +} + +} // namespace + +Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors) + : _id{id}, _operator_type{operator_type}, _tensors{tensors} +{ +} + +OperatorId Operator::id() const +{ + return _id; +} + +GpuOperatorType Operator::operator_type() const +{ + return _operator_type; +} + +ArgumentPack<ITensorInfo> Operator::tensors() const +{ + return _tensors; +} + +bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const +{ + const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); + const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); + // Constraint 1 + if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) + { + return false; + } + // Constraint 2 + if (_operators.size() >= max_fused_operators) + { + return false; + } + // Constraint 3.1: Pattern: (Unfusable) + if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) + { + return false; + } + // Constraint 3.2 + if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) + { + return false; + } + // Constraint 4 + if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) + { + return false; + } + // Constraint 5 + if (_operators.size() > 0) + { + const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); + 
ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor = root_dst_tensors[0]; + const auto dst_tensors = op.tensors().get_const_dst_tensors(); + for (const auto &t : root_dst_tensors) + { + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + { + return false; + } + } + for (const auto &t : dst_tensors) + { + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + { + return false; + } + } + } + // Constraint 6 + if (_operators.size() > 0) + { + const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); + ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); + const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); + const auto dst_tensors = op.tensors().get_const_dst_tensors(); + for (const auto &t : root_dst_tensors) + { + if (t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + for (const auto &t : dst_tensors) + { + if (t->data_layout() != first_dst_tensor_layout) + { + return false; + } + } + } + return true; +} +void GpuOperatorGroup::add_operator(const Operator &op, bool is_output) +{ + ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output)); + const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); + const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); + _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output); + _operators[op.id()] = op; +} +Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, + const ArgumentPack<ITensorInfo> &tensors) const +{ + auto new_id = static_cast<OperatorId>(_operators.size()); + return Operator{new_id, operator_type, tensors}; +} +const Operator *GpuOperatorGroup::get_root_operator() const +{ + const auto roots = _graph.get_root_ops(); + ARM_COMPUTE_ERROR_ON(roots.size() > 1); + if (roots.empty()) + { + return nullptr; + } + return &_operators.at(roots[0]); +} + 
+} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h new file mode 100644 index 0000000000..0a2369d357 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP

#include "arm_compute/core/ITensorInfo.h"

#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"

#include <map>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
using OperatorId = DependencyGraph::OperatorId;

/** An operator for the sole purpose of validating fusion
 */
class Operator
{
public:
    /** Default constructor */
    Operator() = default;
    /** Get Operator Id */
    OperatorId id() const;
    /** Get operator type */
    GpuOperatorType operator_type() const;
    /** Get tensor arguments */
    ArgumentPack<ITensorInfo> tensors() const;
    friend class GpuOperatorGroup;

private:
    // Only GpuOperatorGroup may create Operators (via new_operator), which is
    // how ids are kept consistent with the group's dependency graph
    Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
    OperatorId                _id{};            /**< Id within the owning group's dependency graph */
    GpuOperatorType           _operator_type{}; /**< Fusion category of this operator */
    ArgumentPack<ITensorInfo> _tensors{};       /**< Tensor arguments of this operator */
};

/** A linear sequence of operators to be fused in a workload
 * For the time being, this class is only used for validating operator fusion
 * INVARIANTS:
 * @note These invariants are exactly the same as operator fusion constraints
 * 1. Fusion is limited to a linear sequence of operators
 * 2. Max number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
 * 3. The fusion is subject to the pattern: Complex + Simple * | Simple + Simple * | Un-fusable
 * 4. All operator but unfusable, have exactly 1 dst tensor
 * 5. All fused operators share the same dst tensor shape
 * 6. All fused operators' tensors share the same @ref DataLayout
 */
class GpuOperatorGroup
{
public:
    static constexpr size_t max_fused_operators = 32;
    /** Try adding (without actually adding) an operator to the group
     *
     * @param[in] op        Operator to be added
     * @param[in] is_output Whether this operator is the output operator.
     *
     * @return true  If @p op can be added while maintaining the invariants
     * @return false Otherwise
     */
    bool try_add_operator(const Operator &op, bool is_output = false) const;
    /** Add an operator to the group
     *
     * @param[in] op        Operator to be added
     * @param[in] is_output Whether this operator is the output operator.
     */
    void add_operator(const Operator &op, bool is_output = false);
    /** Create a new operator
     *
     * @param[in] operator_type @ref GpuOperatorType of the new operator
     * @param[in] tensors       Tensor arguments to the new operator
     *
     * @return Operator
     */
    Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const;
    /** Get the "root operator" of the group, which is the first operator in a linear sequence
     * @return const Operator* Pointer to the root operator
     */
    const Operator *get_root_operator() const;

private:
    DependencyGraph _graph{};               /**< Tracks tensor producer/consumer relations between the operators */
    std::map<OperatorId, Operator> _operators{}; /**< Operators added so far, keyed by id */
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
new file mode 100644
index 0000000000..c77697c343
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
@@ -0,0 +1,54 @@
/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Contain properties common to all operator types */

/** Operator type in the context of fusion
 */
enum class GpuOperatorType
{
    /** Simple operators are operators that:
     * 1. Have a 1-to-1 mapping between the input elements and output elements, like elementwise
     * 2. Have exactly 1 output
     */
    Simple,
    /** Complex operators are operators that are not simple but are still fusable with simple ones
     */
    Complex,
    /** Unfusable operators are operators that cannot be fused with any other types of operators
     */
    Unfusable
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
new file mode 100644
index 0000000000..fab18aabb4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -0,0 +1,152 @@
/*
 * Copyright (c) 2022-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"

#include "arm_compute/core/CL/CLCompileContext.h"

#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
// Public GpuWorkloadContext is a thin pimpl wrapper: all state lives in Impl
GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
    : _impl{std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx)}
{
}

GpuWorkloadContext::~GpuWorkloadContext() = default;

GpuWorkloadContext::GpuWorkloadContext(GpuWorkloadContext &&other) = default;

GpuWorkloadContext &GpuWorkloadContext::operator=(GpuWorkloadContext &&other) = default;

GpuTarget GpuWorkloadContext::gpu_target() const
{
    // Target is derived from the CL compile context rather than stored separately
    return _impl->cl_compile_context()->get_gpu_target();
}

GpuLanguage GpuWorkloadContext::gpu_language() const
{
    return _impl->gpu_language();
}

const CLCompileContext *GpuWorkloadContext::cl_compile_context() const
{
    return _impl->cl_compile_context();
}

void GpuWorkloadContext::register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info)
{
    _impl->register_user_tensor(std::move(tensor_info));
}

GpuWorkloadContext::Impl &GpuWorkloadContext::implementation()
{
    return *_impl;
}

const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const
{
    return *_impl;
}

// Tensor ids start at 1; 0 is never handed out (see next_tensor_id())
GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx)
    : _gpu_language(gpu_language),
      _cl_compile_ctx(cl_compile_ctx),
      _next_tensor_id(1),
      _mem_map(),
      _managed_tensor_info()
{
}

GpuLanguage GpuWorkloadContext::Impl::gpu_language() const
{
    return _gpu_language;
}

const CLCompileContext *GpuWorkloadContext::Impl::cl_compile_context() const
{
    return _cl_compile_ctx;
}

const MemoryDescriptorMap &GpuWorkloadContext::Impl::mem_map() const
{
    return _mem_map;
}

void GpuWorkloadContext::Impl::register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info)
{
    // A tensor may only be registered once: it must not already carry a valid id
    ARM_COMPUTE_ERROR_ON(tensor_info->has_valid_id());

    const auto tensor_id = next_tensor_id();

    tensor_info->set_id(tensor_id);
    _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User};
    // Save a *copy* of the user tensor info in workload context for future reference
    // Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context
    _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
}

ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor()
{
    auto tensor_info = std::make_unique<TensorInfo>();
    // Virtual tensors are distinguished from user/auxiliary ones by a negative id
    const auto tensor_id = -next_tensor_id();
    tensor_info->set_id(tensor_id);
    _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual};
    auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
    return inserted.first->second.get();
}

ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo &itensor_info)
{
    // Auxiliary tensors copy the given info and record their size for memory planning
    auto tensor_info = std::make_unique<TensorInfo>(itensor_info);
    const auto tensor_id = next_tensor_id();
    tensor_info->set_id(tensor_id);
    _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}};
    auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
    return inserted.first->second.get();
}

ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id)
{
    // std::map::at throws std::out_of_range if @p id was never created by this context
    return _managed_tensor_info.at(id).get();
}

const ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id) const
{
    return _managed_tensor_info.at(id).get();
}

ITensorInfo::Id GpuWorkloadContext::Impl::next_tensor_id()
{
    return _next_tensor_id++;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
new file mode 100644
index 0000000000..b3571a6480
--- /dev/null
+++
b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Internal implementation of workload context. */ +class GpuWorkloadContext::Impl +{ +public: + /** Constructor + * + * @param[in] gpu_language Target GPU language. + * @param[in] cl_compile_ctx CL compile context. 
+ */ + Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx); + + /** Copy constructor */ + Impl(Impl &) = default; + + /** Assignment */ + Impl &operator=(Impl &) = default; + + /** Get target GPU language. */ + GpuLanguage gpu_language() const; + + /** Get CL compile context. */ + const CLCompileContext *cl_compile_context() const; + + /** Get memory descriptor registry. */ + const MemoryDescriptorMap &mem_map() const; + + /** Set a new ID and register the user tensor info. + * + * The ownership of the tensor info object will be transfered to this context object. + * + * @param[in] tensor_info The tensor info to be registered. + */ + void register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info); + + /** Create a virtual (see @ref MemoryType) tensor info and save it + * + * @return ITensorInfo* The created virtual tensor info object pointer + */ + ITensorInfo *create_virtual_tensor(); + /** Create an auxiliary (see @ref MemoryType) tensor info and save it + * + * @param[in] tensor_info @ref ITensorInfo to copy from + * + * @return ITensorInfo* The created auxiliary tensor info object pointer + */ + ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info); + + /** Get tensor info created by this context, from id */ + ITensorInfo *get_tensor_info(ITensorInfo::Id id); + + /** Get tensor info created by this context, from id */ + const ITensorInfo *get_tensor_info(ITensorInfo::Id id) const; + +private: + ITensorInfo::Id next_tensor_id(); + + GpuLanguage _gpu_language; + CLCompileContext *_cl_compile_ctx; + + ITensorInfo::Id _next_tensor_id; + MemoryDescriptorMap _mem_map; + std::map<ITensorInfo::Id, std::unique_ptr<TensorInfo>> _managed_tensor_info; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp new file 
mode 100644 index 0000000000..357cb48a84 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique<Implementation>(context)} +{ +} + +GpuWorkloadSketch::~GpuWorkloadSketch() +{ +} + +GpuWorkloadSketch::GpuWorkloadSketch(GpuWorkloadSketch &&) = default; + +GpuWorkloadSketch &GpuWorkloadSketch::operator=(GpuWorkloadSketch &&) = default; + +const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const +{ + return _impl->context(); +} + +GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() +{ + return _impl->context(); +} + +GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() +{ + return *_impl; +} + +const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const +{ + return *_impl; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h new file mode 100644 index 0000000000..04e294eacc --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H + +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h" +#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Internal implementation of @ref GpuWorkloadSketch */ +class GpuWorkloadSketch::Implementation +{ +public: + /** Constructor + * + * @param[in] context global workload creation context + */ + explicit Implementation(Context *context) + : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{} + { + } + /** Prevent instances of this class from being copy constructed */ + Implementation(const Implementation &impl) = delete; + /** Prevent instances of this class from being copied */ + Implementation &operator=(const Implementation &impl) = delete; + /** Allow instances of this class to be move constructed */ + Implementation(Implementation &&impl) = default; + /** Allow instances of this class to be moved */ + Implementation &operator=(Implementation &&impl) = default; + /** Get workload context */ + const Context *context() 
const + { + return _context; + } + /** Get workload context */ + Context *context() + { + return _context; + } + /** Get component graph */ + const GpuKernelComponentGraph &component_graph() const + { + return _component_graph; + } + /** Get component graph */ + GpuKernelComponentGraph &component_graph() + { + return _component_graph; + } + /** Get operator group */ + const GpuOperatorGroup &operator_group() const + { + return _operator_group; + } + /** Get operator group */ + GpuOperatorGroup &operator_group() + { + return _operator_group; + } + /** Generate @ref GpuWorkloadSourceCode from the workload sketch + * @note The sketch must be valid. Any error encountered during the building of the code will be thrown. + * + * @return GpuWorkloadSourceCode The generated workload code + */ + GpuWorkloadSourceCode generate_source_code() const + { + const auto mem_map = _context->implementation().mem_map(); + return component_graph().fuse(mem_map).write_workload_code(); + } + /** Create a virtual (see @ref MemoryType) tensor info and save it + * + * @return ITensorInfo* The created virtual tensor info object pointer + */ + ITensorInfo *create_virtual_tensor() + { + return _context->implementation().create_virtual_tensor(); + } + /** Create an auxiliary (see @ref MemoryType) tensor info and save it + * + * @param[in] tensor_info @ref ITensorInfo to copy from + * + * @return ITensorInfo* The created auxiliary tensor info object pointer + */ + ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info) + { + return _context->implementation().create_auxiliary_tensor(tensor_info); + } + + ITensorInfo *get_tensor_info(ITensorInfo::Id id) + { + return _context->implementation().get_tensor_info(id); + } + +private: + Context *_context; + GpuComponentServices _comp_services; + GpuKernelComponentGraph _component_graph; + GpuOperatorGroup _operator_group; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // 
ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h new file mode 100644 index 0000000000..5d75bcaaa0 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +/** Extract kernel arguments of one tensor from a flat list of kernel arguments. + * + * @param[in] flat_kernel_args + * @return GpuKernelArgumentList + */ +GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args) +{ + if (flat_kernel_args.empty()) + { + return {}; + } + GpuKernelArgumentList tensor_kargs{}; + + const GpuKernelArgumentBinding &karg_head = flat_kernel_args.front(); + tensor_kargs.push_back(karg_head); + flat_kernel_args.pop_front(); + const auto tensor_id = karg_head.id(); + + while (!flat_kernel_args.empty()) + { + const GpuKernelArgumentBinding &karg = flat_kernel_args.front(); + if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments + { + return tensor_kargs; + } + tensor_kargs.push_back(karg); + flat_kernel_args.pop_front(); + } + return tensor_kargs; +} +} // namespace +/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */ +using UnitWorkloadId = int32_t; + +/** Describes all the info related to a **workload argument** (tensor) in order to: + * - be used by runtime to configure gpu kernel argument + * - be used by memory managers to allocate required memory + */ +class GpuWorkloadArgument +{ +public: + /** Default constructor */ + GpuWorkloadArgument() = default; + /** Constructor + * + * @param[in] tensor_info @ref ITensorInfo of the workload argument + * @param[in] mem_desc @ref MemoryDescriptor of the workload argument + * @param[in] 
kernel_args @ref GpuKernelArgumentList of the workload argument + */ + GpuWorkloadArgument(const ITensorInfo &tensor_info, + const MemoryDescriptor &mem_desc, + const GpuKernelArgumentList &kernel_args) + : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args} + { + } + /** Get tensor id within workload */ + ITensorInfo::Id id() const + { + return _tensor_info.id(); + } + /** Get @ref ITensorInfo of the argument */ + ITensorInfo *tensor_info() + { + return &_tensor_info; + } + /** Get @ref ITensorInfo of the argument */ + const ITensorInfo *tensor_info() const + { + return &_tensor_info; + } + /** Get @ref MemoryDescriptor of the argument */ + MemoryDescriptor *memory_descriptor() + { + return &_mem_desc; + } + /** Get @ref MemoryDescriptor of the argument */ + const MemoryDescriptor *memory_descriptor() const + { + return &_mem_desc; + } + /** Get @ref GpuKernelArgumentList of the workload tensor */ + GpuKernelArgumentList *kernel_argument_list() + { + return &_kernel_args; + } + /** Get @ref GpuKernelArgumentList of the workload tensor */ + const GpuKernelArgumentList *kernel_argument_list() const + { + return &_kernel_args; + } + /** Check if the workload argument has valid id + * + * @return true If has valid id + * @return false Otherwise + */ + bool has_valid_id() const + { + return _tensor_info.has_valid_id(); + } + +private: + TensorInfo _tensor_info{}; + MemoryDescriptor _mem_desc{}; + GpuKernelArgumentList _kernel_args{}; +}; + +/** Describes when a unit workload is run. + */ +struct UnitWorkloadStage +{ + enum class Stage + { + Prepare, /**< Only run once at the beginning. */ + Run, /**< Run every time after the first time. */ + }; + Stage stage{Stage::Run}; +}; + +inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1) +{ + return stage0.stage == stage1.stage; +} + +/** The atomic unit in a Gpu workload. It contains exactly one kernel to run. 
+ */ +class GpuUnitWorkload +{ +public: + /** Default constructor */ + GpuUnitWorkload() = default; + /** Constructor + * + * @param[in] id Id that uniquely identifies this unit workload in a workload + * @param[in] kernel_code @ref GpuKernelSourceCode contained within + * @param[in] stage Stage of the unit workload + */ + GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage) + : _id{id}, _kernel_code{kernel_code}, _stage{stage} + { + } + /** Get the id of the unit workload */ + UnitWorkloadId id() const + { + return _id; + } + /** Get reference to the underlying @ref GpuKernelSourceCode */ + const GpuKernelSourceCode &code() const + { + return _kernel_code; + } + /** Get the stage of the unit workload */ + UnitWorkloadStage stage() const + { + return _stage; + } + +private: + UnitWorkloadId _id{}; + GpuKernelSourceCode _kernel_code{}; + UnitWorkloadStage _stage{}; +}; + +/** Hold the generated kernel source code and other information required to compile and run the workload. 
+ */ +class GpuWorkloadSourceCode +{ +public: + /** Default constructor */ + GpuWorkloadSourceCode() = default; + /** Add a unit workload to the workload code + * + * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload + * @param[in] stage Stage of the unit workload + * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload + * @param[in] context @ref GpuWorkloadContext associated with the unit workload + * + * @return UnitWorkloadId Allocated unit workload id + */ + UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, + const UnitWorkloadStage &stage, + const MemoryDescriptorMap &mem_map, + const GpuWorkloadContext *context) + { + // Use the size of the kernel codes as Id + const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size()); + const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage); + _unit_workloads.push_back(unit_work); + + GpuKernelArgumentList flat_kernel_args = kernel_code.arguments(); + GpuKernelArgumentList tensor_kargs{}; + while (true) + { + tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args); + if (tensor_kargs.empty()) + { + break; + } + else + { + const auto tensor_id = tensor_kargs.at(0).id(); + _workload_arguments[tensor_id] = GpuWorkloadArgument{ + *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs}; + if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end()) + { + _tensor_uwork_map[tensor_id] = std::set<UnitWorkloadId>(); + } + _tensor_uwork_map[tensor_id].insert(uwk_id); + } + } + + return uwk_id; + } + /** Get a unit workload from its id */ + const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const + { + ARM_COMPUTE_ERROR_ON(id < 0); + return _unit_workloads.at(id); + } + /** Get all unit workloads sorted in topological order */ + std::vector<UnitWorkloadId> unit_workloads() const + { + std::vector<UnitWorkloadId> ids{}; + + for (const auto &uwk : 
_unit_workloads) + { + ids.push_back(uwk.id()); + } + return ids; + } + /** Get a @ref GpuWorkloadArgument from its associated tensor id */ + const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const + { + return &_workload_arguments.at(t_id); + } + /** Get all tensors in the entire workload */ + std::vector<ITensorInfo::Id> tensors() const + { + std::vector<ITensorInfo::Id> ids{}; + for (const auto &id_tensor : _workload_arguments) + { + ids.push_back(id_tensor.first); + } + return ids; + } + /** Get all unit workloads connected to the tensor with @p t_id */ + std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const + { + const auto unit_work_set = _tensor_uwork_map.at(t_id); + return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end()); + } + +private: + std::vector<GpuUnitWorkload> _unit_workloads{}; + std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{}; + std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h new file mode 100644 index 0000000000..84972501de --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/Window.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" + +#include <map> +#include <string> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** An interface that can write a gpu kernel + */ +class IGpuKernelWriter +{ +public: + /** Destructor */ + virtual ~IGpuKernelWriter() + { + } + /** Generate kernel name */ + virtual std::string get_name() = 0; + /** Generate kernel code */ + virtual std::string get_code() = 0; + /** Generate build options */ + virtual CLBuildOptions get_build_options() + { + return {}; + } + /** Generate config id string of the entire kernel. This is used for tuning */ + virtual std::string get_config_id() = 0; + /** Generate execution window */ + virtual Window get_window() const = 0; + /** Get the flat list of arguments of the kernel*/ + virtual GpuKernelArgumentList get_kernel_arguments() + { + return {}; + } +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp new file mode 100644 index 0000000000..a42b39700c --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + +#include "compute_kernel_writer/include/ckw/Error.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand tensor) : _tensor(tensor) +{ +} + +GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, + const ckw::TensorSampler &sampler) +{ + CKW_ASSERT(_tile == nullptr); + + _tile = tile; + _sampler = sampler; + + return *this; +} + +bool GpuCkwComponentArgument::has_tensor() const +{ + return _tensor.is_valid(); +} + +ckw::TensorOperand &GpuCkwComponentArgument::tensor() +{ + CKW_ASSERT(_tensor.is_valid()); + + return _tensor; +} + +const ckw::TensorOperand &GpuCkwComponentArgument::tensor() const +{ + CKW_ASSERT(_tensor.is_valid()); + + return _tensor; +} + +bool GpuCkwComponentArgument::has_tile() const +{ + return _tile.is_valid(); +} + +ckw::TileOperand &GpuCkwComponentArgument::tile() +{ + CKW_ASSERT(_tile.is_valid()); + + return _tile; +} + +const ckw::TileOperand &GpuCkwComponentArgument::tile() const +{ + CKW_ASSERT(_tile.is_valid()); + + return _tile; +} + +ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler() +{ + CKW_ASSERT(_tile.is_valid()); + + return _sampler; +} + +const ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler() const +{ + CKW_ASSERT(_tile.is_valid()); + + return _sampler; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h new file mode 100644 index 0000000000..7a57c81e5f --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H

#include "compute_kernel_writer/include/ckw/TensorOperand.h"
#include "compute_kernel_writer/include/ckw/TensorSampler.h"
#include "compute_kernel_writer/include/ckw/TileOperand.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{

/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor.
 *
 * Holds up to three pieces of state: a tensor operand (user tensors), and a
 * tile + sampler pair that is bound later via @ref init_virtual_tensor.
 */
class GpuCkwComponentArgument
{
public:
    /** Default constructor: neither tensor nor tile is valid. */
    GpuCkwComponentArgument() = default;

    /** Initialize a new instance of @ref GpuCkwComponentArgument class for user tensor.
     *
     * @param[in] tensor The user tensor.
     */
    explicit GpuCkwComponentArgument(ckw::TensorOperand tensor);

    /** Bind the tile and sampler to the tensor argument.
     *
     * This method can be used to share a tile and sampler associated to a tensor
     * among different kernel components. For example, when we create the destination
     * tile and destination sampler for the first time (root component), this method can be
     * used to bind these two information to the destination tensor so that the following
     * simple components know the tile size and how to access the elements from memory.
     *
     * NOTE(review): despite the name, this is also how a loaded *user* tensor gets its
     * tile bound, not only virtual tensors — confirm intended naming with callers.
     * May be called at most once per argument.
     *
     * @param[in] tile    The tile that has been loaded.
     * @param[in] sampler The tensor sampling information that has been used to load the tile.
     */
    GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorSampler &sampler);

    /** Get whether the argument is a user tensor (i.e. holds a valid tensor operand). */
    bool has_tensor() const;

    /** Get the tensor operand.
     *
     * If the tensor is not available, throw an error.
     */
    ckw::TensorOperand &tensor();

    /** Get the tensor operand.
     *
     * If the tensor is not available, throw an error.
     */
    const ckw::TensorOperand &tensor() const;

    /** Get whether the argument contains a tile.
     *
     * The argument can be either a user tensor that has been loaded,
     * or a virtual tensor (i.e. a tile with tensor sampling information).
     */
    bool has_tile() const;

    /** Get the tile operand.
     *
     * If the tile is not available, throw an error.
     */
    ckw::TileOperand &tile();

    /** Get the tile operand.
     *
     * If the tile is not available, throw an error.
     */
    const ckw::TileOperand &tile() const;

    /** Get the tensor sampling information for the tile.
     *
     * If the tile is not available, throw an error.
     */
    ckw::TensorSampler &tensor_sampler();

    /** Get the tensor sampling information for the tile.
     *
     * If the tile is not available, throw an error.
     */
    const ckw::TensorSampler &tensor_sampler() const;

private:
    ckw::TensorOperand _tensor{};  /**< Tensor operand (valid for user tensors only) */
    ckw::TileOperand   _tile{};    /**< Tile bound via init_virtual_tensor() */
    ckw::TensorSampler _sampler{}; /**< Sampler bound together with the tile */
};

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute

#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp new file mode 100644 index 0000000000..a0e5e16aa0 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp @@ -0,0 +1,139 @@
/*
 * Copyright (c) 2023-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
+ */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Window.h" + +#include "src/common/utils/Log.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include "compute_kernel_writer/include/ckw/types/TargetArchitecture.h" +#include "compute_kernel_writer/include/ckw/types/TargetLanguage.h" + +using namespace ckw; + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) +{ + _components = components; + + // Generate kernel name + std::string kernel_name; + for (auto &comp : _components) + { + auto ckw_driver = comp->ckw_component_driver(); + ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); + kernel_name += ckw_driver->get_name(_components) + "__"; + } + + // Generate kernel code + auto root_writer = + KernelWriter::create_instance(ckw::TargetArchitecture::GpuArmMaliValhall, ckw::TargetLanguage::OpenCL); + GpuCkwScopedKernelWriter writer(root_writer.get()); + GpuCkwVariableTable vtable{}; + + for (auto &comp : _components) + { + auto ckw_driver = comp->ckw_component_driver(); + ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); + ckw_driver->write_component_code(_components, vtable, writer); + } + auto kernel = root_writer->emit_kernel(kernel_name); + + // Set the kernel name, kernel arguments and source code + _kernel_name = kernel_name; + _kernel_args = kernel->arguments(); + _kernel_code = kernel->source_code(); +} + +std::string GpuCkwDriver::get_name() +{ + return _kernel_name; +} + 
+std::string GpuCkwDriver::get_code() +{ + return _kernel_code; +} + +std::string GpuCkwDriver::get_config_id() +{ + std::string id; + for (auto &comp : _components) + { + auto ckw_driver = comp->ckw_component_driver(); + ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); + id = ckw_driver->get_tuner_id(_components) + "__"; + } + return id; +} + +Window GpuCkwDriver::get_window() const +{ + const auto root_comp = _components.get_root_component(); + ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found"); + return root_comp->ckw_component_driver()->get_window(); +} + +GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments() +{ + GpuKernelArgumentList args{}; + for (const auto &arg : _kernel_args) + { + switch (arg.type()) + { + case KernelArgument::Type::TensorStorage: + { + args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_storage_type())); + break; + } + case KernelArgument::Type::TensorComponent: + { + args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_component_type())); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported KernelArgument Type"); + break; + } + } + } + return args; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h new file mode 100644 index 0000000000..f8770920b7 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H

#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"

#include "compute_kernel_writer/include/ckw/Kernel.h"
#include "compute_kernel_writer/include/ckw/KernelArgument.h"
#include <string>

namespace arm_compute
{
/** Forward declarations */
class Window;

namespace experimental
{
namespace dynamic_fusion
{
/** Use Kernel Writer to write kernel code
 * Used by dynamic_fusion module
 *
 * All code generation happens eagerly in the constructor; the getters below
 * only return the cached results.
 */
class GpuCkwDriver : public IGpuKernelWriter
{
public:
    /** Default constructor */
    GpuCkwDriver() = delete;
    /** Constructor
     *
     * @param[in] components Kernel component group from which the kernel will be generated
     */
    GpuCkwDriver(const GpuKernelComponentGroup &components);
    /** Destructor */
    ~GpuCkwDriver() override = default;
    /** Generate kernel name */
    std::string get_name() override;
    /** Generate kernel code */
    std::string get_code() override;
    /** Generate config id string of the entire kernel. This is used for tuning */
    std::string get_config_id() override;
    /** Generate execution window (taken from the root component) */
    Window get_window() const override;
    /** Get the flat list of arguments of the kernel*/
    GpuKernelArgumentList get_kernel_arguments() override;

private:
    GpuKernelComponentGroup          _components{};  /**< Copy of the component group the kernel was generated from */
    std::string                      _kernel_name{}; /**< Cached kernel name (concatenation of component names) */
    std::vector<ckw::KernelArgument> _kernel_args{}; /**< Cached ckw argument descriptors of the emitted kernel */
    std::string                      _kernel_code{}; /**< Cached generated kernel source code */
};

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute

#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{

// Opening a scope remembers the writer's current id space and pushes a fresh
// one, so variables declared inside this scope cannot clash with the parent's.
GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(ckw::KernelWriter *writer)
    : _writer(writer), _parent_id_space(writer->id_space())
{
    _writer->new_id_space();
}

// Copy construction deliberately opens a *new* scope on the same underlying
// writer; this is how each component gets its own id space when the scoped
// writer is passed by value into write_component_code().
// NOTE(review): _parent_id_space is recorded but never restored (no destructor
// pops the id space) — presumably new_id_space() alone guarantees uniqueness;
// confirm against the ckw::KernelWriter API.
GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other)
    : _writer(other._writer), _parent_id_space(other._writer->id_space())
{
    _writer->new_id_space();
}

// Forward member access to the underlying writer (mutable access).
ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->()
{
    return _writer;
}

// Forward member access to the underlying writer (read-only access).
const ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->() const
{
    return _writer;
}

// Expose the raw writer pointer (mutable).
ckw::KernelWriter *GpuCkwScopedKernelWriter::writer()
{
    return _writer;
}

// Expose the raw writer pointer (read-only).
const ckw::KernelWriter *GpuCkwScopedKernelWriter::writer() const
{
    return _writer;
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H

#include "compute_kernel_writer/include/ckw/KernelWriter.h"
#include <cstdint>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{

/** Helper to automatically manage kernel writer ID space.
 *
 * Construction (and copy construction) pushes a new id space onto the wrapped
 * @ref ckw::KernelWriter, so each scope declares variables with unique names.
 * The class does not own the writer; the caller keeps it alive.
 */
class GpuCkwScopedKernelWriter
{
public:
    /** Initialize a new instance of @ref GpuCkwScopedKernelWriter class. */
    explicit GpuCkwScopedKernelWriter(ckw::KernelWriter *writer);

    /** Create a new scope from the specified scoped kernel writer. */
    GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other);

    /** Assignment is disallowed. */
    GpuCkwScopedKernelWriter &operator=(const GpuCkwScopedKernelWriter &) = delete;

    /** Access the underlying kernel writer. */
    ckw::KernelWriter *operator->();

    /** Access the underlying kernel writer. */
    const ckw::KernelWriter *operator->() const;

    /** Get the kernel writer. */
    ckw::KernelWriter *writer();

    /** Get the kernel writer. */
    const ckw::KernelWriter *writer() const;

private:
    ckw::KernelWriter *_writer;          /**< Non-owning pointer to the shared kernel writer */
    int32_t            _parent_id_space; /**< Id space active before this scope was opened.
                                              NOTE(review): stored but never restored — verify whether
                                              a pop is needed in the ckw::KernelWriter API. */
};

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute

#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include <sstream> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + const std::string &alias) +{ + ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); + + // Do not re-declare if the variable associated with the tensor has already been declared + auto it = _vars.find(tensor->id()); + + if (it != _vars.end()) + { + return &it->second; + } + if (comp_group.is_intermediate_tensor(tensor)) + { + // Create a virtual tensor variable + GpuCkwComponentArgument var; + auto &&inserted = _vars.emplace(tensor->id(), var); + return &(inserted.first->second); + } + else + { + // Create a user tensor variable + std::stringstream ss; + ss << alias << "_t" << abs(tensor->id()); + const auto uniq_name = ss.str(); + GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor))}; + auto &&inserted = _vars.emplace(tensor->id(), var); + return &(inserted.first->second); + } +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h 
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H

#include "arm_compute/core/ITensorInfo.h"

#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"

#include <map>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
class GpuKernelComponentGroup;
class GpuCkwScopedKernelWriter;

/** A table of all the variables used in the kernel.
 *
 * It determines whether we create an virtual tensor var or a user tensor var
 * It avoids duplicating variables for the same tensors (Tensors with the same id)
 * Each kernel has exactly one variable table.
 */
class GpuCkwVariableTable
{
public:
    /** Declare a kernel component variable(argument) for the corresponding tensor info.
     *
     * If a variable for the tensor's id already exists, the existing variable is
     * returned instead of declaring a new one.
     *
     * @param[in] comp_group Component group the tensor belongs to
     * @param[in] writer     Compute Kernel Writer
     * @param[in] tensor     Tensor info with which the new variable is associated
     * @param[in] alias      Alias for the variable. Will be used as part of the variable name
     *
     * @return GpuCkwComponentArgument* Pointer into the table; stays valid while the table lives
     *                                  (std::map never invalidates references on insertion)
     */
    GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
                                              GpuCkwScopedKernelWriter      &writer,
                                              const ITensorInfo             *tensor,
                                              const std::string             &alias = "unnamed");

private:
    // Variables keyed by tensor id; one entry per distinct tensor in the kernel.
    std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
};

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER + +#include "arm_compute/core/Window.h" + +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/Types.h" + +namespace arm_compute +{ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +class GpuKernelComponentGroup; +class GpuCkwVariableTable; +class GpuCkwScopedKernelWriter; + +/** An interface used by @ref GpuCkwDriver to write source code for a kernel component + * + * There are 3 main architecture layers for using Compute Kernel Writer (Ckw) inside ACL's dynamic fusion module + * From top level to bottom level: + * | Layer | Library + * =========================== + * | dynamic_fusion | acl + * | ckw_driver | acl + * | ckw | ckw + * + * ckw_driver is a glue layer that directs how fused code is produced using the ckw library + * + * There are two main groups within ckw_driver: + * - @ref GpuCkwDriver is a global driver that coordinates how the final fused code along with all the info necessary + * for run time execution is produced using ckw + * - Various classes implementing @ref IGpuCkwComponentDriver is a component driver that directs ckw to generate kernel component code (e.g. activation, store etc.) + * + * The overall flow goes like this: + * In dynamic_fusion module, @ref GpuLogicalKernel instantiates a @ref GpuCkwDriver from a @ref GpuKernelComponentGroup + * The logical kernel then uses the global driver's various interfaces to generate the code info. 
+ * In particular, the @ref GpuCkwDriver::get_code() interface will call into each @ref IGpuCkwComponentDriver::write_component_code() + */ +class IGpuCkwComponentDriver +{ +public: + using ComponentGroup = GpuKernelComponentGroup; + +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + */ + IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors} + { + } + /** Destructor */ + virtual ~IGpuCkwComponentDriver() + { + } + /** Generate kernel component code + * + * @param[in] comp_group Component group of which the component is a part of + * @param[in, out] vtable Table of variables declared by each component + * @param[in, out] writer CKW writer that writes code scoped to this kernel component. + * + * @note @p writer can only be passed via value since the new scope is created in the copy constructor + */ + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const = 0; + /** Get tensor arguments */ + ArgumentPack<ITensorInfo> tensors() const + { + return _tensors; + } + /** Generate the execution window for the component */ + virtual Window get_window() const + { + return Window{}; + } + /** Generate the name of the component + * + * This will be concatenated with other components' names to form the name of the kernel + */ + virtual std::string get_name(const ComponentGroup &comp_group) const + { + ARM_COMPUTE_UNUSED(comp_group); + return "unnamed"; + } + /** Generate the tuner id of the component + * This id should capture all the parameters that distinguish one kernel's lws tuning from another. + * e.g. two components that are identical in every other way, but have output tensor dimensions should + * have different tuner ids, because the lws of one may not be optimal on the other. 
+ * + * This will be concatenated with other components' tuner id to form the tuner id of the kernel + */ + virtual std::string get_tuner_id(const ComponentGroup &comp_group) const + { + ARM_COMPUTE_UNUSED(comp_group); + return ""; + } + /** Get component id */ + ComponentId id() const + { + return _id; + } + +private: + ComponentId _id{-1}; + ArgumentPack<ITensorInfo> _tensors{}; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER */ diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp new file mode 100644 index 0000000000..18fda5bd6b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "GpuCkwActivation.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwActivation::GpuCkwActivation(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes) // NOLINT + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + 
GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_h = static_cast<int32_t>(_dst->dimension(1)); + const auto dst_dt = to_ckw(_dst->data_type()); + + // CKW constants + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_neg_1_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f}}, dst_dt)); + auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, dst_dt)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_A_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.a()}}, dst_dt)); + auto const_B_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.b()}}, dst_dt)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + + // Destination compute block size + int32_t dst_n0 = -1; + int32_t dst_m0 = -1; + + // Destination compute block size left-over + int32_t dst_n0_partial = -1; + int32_t dst_m0_partial = -1; + + // Shift-back for the overlapping-min strategy + int32_t dst_shift_back = -1; + + if (!dst->has_tile()) + { + // If ROOT component, we use 
ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1 + // as tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + dst_n0 = root_window.x().step(); + dst_m0 = root_window.y().step(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + } + else + { + // dst_m0_partial depends on the TensorSamplerFormat + dst_n0 = dst->tile().tile_info().width(); + dst_m0 = dst->tile().tile_info().height(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + + ckw::TensorSampler sampler_dst = dst->tensor_sampler(); + + if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + } + else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + dst_m0_partial = _dst->dimension(1) % dst_m0; + } + + // Shift-back for the overlapping-min strategy + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + } + + const auto &tile_dst = dst->tile(); + + 
/******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_dst_shift_back_n0 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + if (!src->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_src = dst->tensor_sampler(); + + auto tile_gid_0 = writer->declare_tile("gid_0_src", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1_src", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2_src", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_nout0 = writer->declare_tile("nout0_src", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = + writer->declare_tile("mout0_src", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1_src", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0_src", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_dst_n0, const_dst_shift_back_n0, + const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0); + + // Get 
the boundary aware coordinates at each global dimension index + if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + } + + auto tile_src = writer->declare_tile("src", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + writer->op_load(tile_src, src->tensor(), sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() it is used to bring the tile_src outside the compound statement + src->init_virtual_tensor(tile_src, sampler_src); + } + + const auto &tile_src = src->tile(); + + /******************************************************************************** + * 7 - Write the rest of the code + ********************************************************************************/ + switch (_attributes.activation()) + { + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + { + // dst = src * -1 + writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_neg_1_fp); + // dst = exp(src * -1) + writer->op_unary(tile_dst, ckw::UnaryOp::Exp, tile_dst); + // dst = 1 + (exp(src * -1)) + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, const_pos_1_fp); + // dst = 1 / 1 + (exp(src * -1)) + writer->op_binary(tile_dst, ckw::BinaryOp::Div, const_pos_1_fp, tile_dst); + break; + } + case ActivationLayerInfo::ActivationFunction::TANH: + { + writer->op_unary(tile_dst, ckw::UnaryOp::Tanh, tile_src); + break; + } + case ActivationLayerInfo::ActivationFunction::RELU: + { + // dst = max(src, 0) + writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp); + break; + } + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + { + //dst = 
max(src, 0) + writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp); + //dst = min(max(src, 0), A_VAL) + writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp); + break; + } + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + { + //dst = max(src, B_VAL) + writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_B_fp); + //dst = min(max(src, B_VAL), A_VAL) + writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp); + break; + } + default: + CKW_ASSERT(false); + break; + } + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwActivation::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + constexpr uint32_t vector_size_byte_opencl = 16; + const uint32_t num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h new file mode 100644 index 0000000000..386e933a72 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwActivation : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentActivation::Attributes; + /** Constructor + * + * For supported configurations please refer to @ref GpuCkwActivation::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + */ + GpuCkwActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation); + /** Destructor */ + ~GpuCkwActivation() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION */ diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp new file mode 100644 index 0000000000..d3e0dbafd4 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "GpuCkwCast.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON_MSG(is_data_type_float(_src->data_type()) == false, + "The source data type must be a floating-point data type"); +} + +void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define 
CKW constants + ********************************************************************************/ + const auto dst_h = static_cast<int32_t>(_dst->dimension(1)); + + // CKW constants + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + + // Destination compute block size + int32_t dst_n0 = -1; + int32_t dst_m0 = -1; + + // Destination compute block size left-over + int32_t dst_n0_partial = -1; + int32_t dst_m0_partial = -1; + + // Shift-back for the overlapping-min strategy + int32_t dst_shift_back = -1; + + if (!dst->has_tile()) + { + // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1 + // as tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + dst_n0 = root_window.x().step(); + dst_m0 = root_window.y().step(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if 
(dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + ckw::DataType dst_dt = to_ckw(_dst->data_type()); + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + } + else + { + // Change dst_n0 and dst_m0 if NOT root component! + // ATTENTION: + // dst_m0_partial depends on the TensorSamplerFormat + dst_n0 = dst->tile().tile_info().width(); + dst_m0 = dst->tile().tile_info().height(); + dst_n0_partial = _dst->dimension(0) % dst_n0; + + ckw::TensorSampler sampler_dst = dst->tensor_sampler(); + + if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + } + else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + dst_m0_partial = _dst->dimension(1) % dst_m0; + } + + // Shift-back for the overlapping-min strategy + dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + } + + const auto &tile_dst = dst->tile(); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_dst_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + 
/******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + if (!src->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_src = dst->tensor_sampler(); + + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32, + const_dst_shift_back_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32); + + // Get the boundary aware coordinates at each global dimension index + if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + } + ckw::DataType src_dt = to_ckw(_src->data_type()); + auto tile_src = writer->declare_tile("src", 
ckw::TileInfo(src_dt, dst_m0, dst_n0)); + + writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() it is used to bring the tile_src outside the compound statement + src->init_virtual_tensor(tile_src, sampler_src); + } + + auto tile_src = src->tile(); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code (optional) + ********************************************************************************/ + + // Not required + + /******************************************************************************** + * 7 - Write the rest of the code + ********************************************************************************/ + // Only None ConvertPolicy is supported for floating-point data types + ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None; + + writer->op_cast(tile_dst, tile_src, convert_policy); + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwCast::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + constexpr uint32_t vector_size_byte_opencl = 16; + const uint32_t num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h 
b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h new file mode 100644 index 0000000000..2389301196 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwCast : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentCast::Attributes; + /** Constructor + * + * For supported configurations please refer to @ref ClComponentCast::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + */ + GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast); + /** Destructor */ + ~GpuCkwCast() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST */ diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp new file mode 100644 index 0000000000..cfccab186b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwDepthwiseConv2d::GpuCkwDepthwiseConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _bia, _dst); +} + +void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + // Data Layout is NHWC + const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), 
DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + GpuCkwComponentArgument *bia = nullptr; + + const bool using_bias = _bia != nullptr; + + if (using_bias) + { + bia = vtable.declare_variable(comp_group, writer, _bia, "bia"); + } + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx)); + const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx)); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + const auto stride_x = static_cast<int32_t>(_attributes.stride().x()); + const auto stride_y = static_cast<int32_t>(_attributes.stride().y()); + const auto pad_x = static_cast<int32_t>(_attributes.pad().left); + const auto pad_y = static_cast<int32_t>(_attributes.pad().top); + const auto depth_multiplier = static_cast<int32_t>(_attributes.depth_multiplier()); + const auto dilation_x = static_cast<int32_t>(_attributes.dilation().x()); + const auto dilation_y = static_cast<int32_t>(_attributes.dilation().y()); + const auto kernel_size = kernel_width * kernel_height; + + // CKW constants + auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, 
ckw::DataType::Int32)); + auto const_kernel_size_i32 = + writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32)); + auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32)); + auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32)); + auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32)); + auto const_depth_multiplier_i32 = + writer->declare_constant_tile(ckw::ConstantData({{depth_multiplier}}, ckw::DataType::Int32)); + auto const_dilation_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_x}}, ckw::DataType::Int32)); + auto const_dilation_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_y}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on 
the employed tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + const int32_t src_m0 = kernel_width + (dst_m0 - 1); + const int32_t src_n0 = depth_multiplier > 1 ? 1 : dst_n0; + const int32_t wei_m0 = kernel_width; + const int32_t wei_n0 = dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize the destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = 
writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensors + ********************************************************************************/ + // SOURCE SAMPLER + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::SkipLessThanZero); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // WEIGHTS SAMPLER + // We cannot have out-of-bounds accesses for the weights + ckw::TensorSampler sampler_wei; + sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + if (_settings.export_weights_to_cl_image()) + { + sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly); + } + else + { + sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr); + } + + // BIAS SAMPLER + ckw::TensorSampler sampler_bia; + sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_bia.address_mode_x(sampler_dst.address_mode_x()); + sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr); + + /******************************************************************************** + * 6 - Extra 
operations required before writing the main code (Optional) + ********************************************************************************/ + // Not required + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32, + const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32); + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + auto tile_src_ci = writer->declare_tile("src_ci", ckw::DataType::Int32); + writer->op_binary(tile_src_ci, ckw::BinaryOp::Div, tile_cout0, const_depth_multiplier_i32); + + auto tile_src_xi = writer->declare_tile("src_xi", ckw::DataType::Int32); + writer->op_binary(tile_src_xi, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32); + writer->op_binary(tile_src_xi, ckw::BinaryOp::Sub, tile_src_xi, 
const_pad_x_i32); + + auto tile_src_yi = writer->declare_tile("src_yi", ckw::DataType::Int32); + writer->op_binary(tile_src_yi, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32); + writer->op_binary(tile_src_yi, ckw::BinaryOp::Sub, tile_src_yi, const_pad_y_i32); + + // Loop variables + auto tile_yk = writer->declare_tile("yk", ckw::DataType::Int32); + + writer->op_assign(tile_yk, const_0_i32); + + // clang-format off + writer->op_for_loop(tile_yk, ckw::BinaryOp::Less, const_kernel_size_i32, tile_yk, ckw::AssignmentOp::Increment, const_kernel_w_i32, + [&]() + { + auto tile_src = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), src_m0, src_n0)); + auto tile_wei = writer->declare_tile("b", ckw::TileInfo(to_ckw(_wei->data_type()), wei_m0, wei_n0)); + + writer->op_assign(tile_src, const_0_fp); + + auto tile_x_gte_0 = writer->declare_tile("x_gte_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_y_gte_0 = writer->declare_tile("y_gte_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_x_lt_w = writer->declare_tile("x_lt_w", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_y_lt_h = writer->declare_tile("y_lt_h", ckw::TileInfo(ckw::DataType::Int32)); + + // Check if yi + yk * DILATION_Y is out-of-bound + writer->op_binary(tile_y_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_yi, const_0_i32); + writer->op_binary(tile_y_lt_h, ckw::BinaryOp::Less, tile_src_yi, const_src_h_i32); + + auto tile_src_mi = writer->declare_tile("src_mi", ckw::TileInfo(ckw::DataType::Int32)); + + // Load src + for(int32_t xk = 0; xk < src_m0; ++xk) + { + auto const_xk_i32 = writer->declare_constant_tile(ckw::ConstantData({{xk}}, ckw::DataType::Int32)); + + // xi + xk * DILATION_X + writer->op_binary(tile_src_mi, ckw::BinaryOp::Mul, const_xk_i32, const_dilation_x_i32); + writer->op_binary(tile_src_mi, ckw::BinaryOp::Add, tile_src_mi, tile_src_xi); + + // Check if xi + xk * DILATION_X is out-of-bound + writer->op_binary(tile_x_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_mi, 
const_0_i32); + writer->op_binary(tile_x_lt_w, ckw::BinaryOp::Less, tile_src_mi, const_src_w_i32); + + // Set mi to -1 if we have out-of-bound memory accesses + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_gte_0); + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_lt_w); + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_gte_0); + writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_lt_h); + + writer->op_load(tile_src.row(xk), src->tensor(), sampler_src, tile_src_ci, tile_src_mi, tile_src_yi, tile_bout0); + } + + // Load wei + writer->op_load(tile_wei, wei->tensor(), sampler_wei, tile_cout0, tile_yk, const_0_i32, const_0_i32); + + // Attention: MAC (Multiply-and-Accumulate) ternary operator is currently unsupported in CKW + // Therefore, this part should be replaced with the MAC ternary operator when available + auto tile_tmp = writer->declare_tile("tmp", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0)); + for(int32_t m0 = 0; m0 < dst_m0; ++m0) + { + for(int32_t xk = 0; xk < kernel_width; ++xk) + { + auto tile_a = tile_src.row(m0 + xk); + auto tile_b = tile_wei.row(xk); + auto tile_c = tile_dst.row(m0); + + writer->op_binary(tile_tmp, ckw::BinaryOp::Mul, tile_a, tile_b); + writer->op_binary(tile_c, ckw::BinaryOp::Add, tile_c, tile_tmp); + } + } + writer->op_binary(tile_src_yi, ckw::BinaryOp::Add, tile_src_yi, const_dilation_y_i32); + }); + // clang-format on + + // Bias addition + // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of + // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. 
+ if (using_bias) + { + if (!bia->has_tile()) + { + auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0)); + writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout0, const_0_i32, const_0_i32, const_0_i32); + bia->init_virtual_tensor(tile_bia, sampler_bia); + } + auto &tile_bia = bia->tile(); + + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia); + } + + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwDepthwiseConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + TensorShape output_shape = _dst->tensor_shape(); + + Window win = calculate_max_window(output_shape, Steps(_settings.n0(), _settings.m0())); + return win.collapse(win, Window::DimZ); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h new file mode 100644 index 0000000000..a15d3ee710 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +class GpuCkwDepthwiseConv2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentDepthwiseConv2d::Attributes; + using Settings = ClComponentDepthwiseConv2d::Settings; + + /** Constructor + * + * For supported configurations please refer to @ref ClComponentDepthwiseConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + GpuCkwDepthwiseConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDepthwiseConv2d); + /** Destructor */ + ~GpuCkwDepthwiseConv2d() override = default; + // Inherited methods overridden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const 
override; + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_wei; + const ITensorInfo *_bia; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp new file mode 100644 index 0000000000..eb4f644eb6 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> +#include <string> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +using TileContainer = std::vector<std::vector<int32_t>>; + +GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null +} + +void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const auto desc = _settings.direct_conv_descriptor(); + ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image, + "Only the weights tensor can be exported to cl_image"); + + const uint32_t channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); + const uint32_t width_idx 
= get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + GpuCkwComponentArgument *bia = nullptr; + + const bool using_bias = _bia != nullptr; + + if (using_bias) + { + bia = vtable.declare_variable(comp_group, writer, _bia, "bia"); + } + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx)); + const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx)); + const auto src_c = static_cast<int32_t>(_src->dimension(channel_idx)); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_w = static_cast<int32_t>(_dst->dimension(width_idx)); + const auto stride_x = static_cast<int32_t>(_attributes.stride().x()); + const auto stride_y = static_cast<int32_t>(_attributes.stride().y()); + const auto pad_x = static_cast<int32_t>(_attributes.pad().left); + const auto pad_y = static_cast<int32_t>(_attributes.pad().top); + const auto kernel_size = kernel_width * kernel_height; + const auto k0 = + static_cast<int32_t>(adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx))); 
+ + // CKW constants + auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32)); + auto const_kernel_size_i32 = + writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32)); + auto const_src_c_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_c}}, ckw::DataType::Int32)); + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_w}}, ckw::DataType::Int32)); + auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32)); + auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32)); + auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32)); + auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32)); + auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_src_c_i32_minus_k0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{src_c - k0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different 
components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = 
writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the samplers for the input tensors + ********************************************************************************/ + // Exporting the weights tensor to an OpenCL image object is currently only supported when: + // a) k0 is equal to 4 + // The current implementation expects to read a vector of 4 float values into the OpenCL image object. + // b) K is a multiple of 4 + // This is a limitation in the current interface due to the variable table being responsible for maintaining + // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot + // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an + // extra loop to compute the left-over elements. + const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (src_c % 4 == 0); + + // SOURCE SAMPLER + // - We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to + // compute left-over elements + // - We cannot have out-of-bounds reads when the kernel height is equal to 1. In all other cases, we need to ensure the + // indirection buffer mi does not contain negative values representing out-of-bounds reads. + auto address_mode_y_src = + kernel_height == 1 ? 
ckw::TensorSamplerAddressModeY::None : ckw::TensorSamplerAddressModeY::SkipLessThanZero; + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(address_mode_y_src); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // WEIGHTS SAMPLER + // We cannot have out-of-bounds accesses for the weights + ckw::TensorSampler sampler_wei; + sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension + sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + if (use_cl_image_for_weights) + { + sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly); + } + else + { + sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr); + } + + // BIAS SAMPLER + ckw::TensorSampler sampler_bia; + + if (using_bias) + { + sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + sampler_bia.address_mode_x(sampler_dst.address_mode_x()); + sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr); + } + + /******************************************************************************** + * 6 - Extra operations required before writing the main code (optional) + ********************************************************************************/ + + // Not required + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto 
tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout = writer->declare_tile("cout", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout = writer->declare_tile("mout", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH x HEIGHT + auto tile_bout = writer->declare_tile("bout", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout, tile_gid_0, const_dst_n0_i32, + const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout, tile_gid_1, const_dst_m0_i32); + get_coordinate_from_gws(writer, tile_bout, tile_gid_2, const_pos_1_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + // We create a 2d container of size (dst_m0, 1) to store the indices for iteration + TileContainer it; + for (int32_t m = 0; m < dst_m0; ++m) + { + std::vector<int32_t> idx{m}; + it.push_back({idx}); + } + + const auto &const_idxs = writer->declare_constant_tile(ckw::ConstantData(it, ckw::DataType::Int32)); + + auto tile_xi = writer->declare_tile("xi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_yi = writer->declare_tile("yi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + + // Convert the linear index to coordinate + // xi = ((mout + i) % dst_w) * stride_x - pad_x + // yi = ((mout + i) / dst_w) * stride_y - pad_y + writer->op_binary(tile_xi, ckw::BinaryOp::Add, tile_mout, const_idxs); + writer->op_binary(tile_yi, ckw::BinaryOp::Add, tile_mout, 
const_idxs); + writer->op_binary(tile_xi, ckw::BinaryOp::Mod, tile_xi, const_dst_w_i32); + writer->op_binary(tile_yi, ckw::BinaryOp::Div, tile_yi, const_dst_w_i32); + writer->op_binary(tile_xi, ckw::BinaryOp::Mul, tile_xi, const_stride_x_i32); + writer->op_binary(tile_yi, ckw::BinaryOp::Mul, tile_yi, const_stride_y_i32); + writer->op_binary(tile_xi, ckw::BinaryOp::Sub, tile_xi, const_pad_x_i32); + writer->op_binary(tile_yi, ckw::BinaryOp::Sub, tile_yi, const_pad_y_i32); + + auto tile_y_b = writer->declare_tile("y_b", ckw::TileInfo(ckw::DataType::Int32)); + writer->op_binary(tile_y_b, ckw::BinaryOp::Mul, tile_cout, const_kernel_size_i32); + + auto tile_i = writer->declare_tile("i", ckw::TileInfo(ckw::DataType::Int32)); + writer->op_assign(tile_i, const_0_i32); + + // clang-format off + writer->op_for_loop(tile_i, ckw::BinaryOp::Less, const_kernel_size_i32, tile_i, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]() + { + auto tile_x_k = writer->declare_tile("x_k", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_y_k = writer->declare_tile("y_k", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_binary(tile_x_k, ckw::BinaryOp::Mod, tile_i, const_kernel_w_i32); + writer->op_binary(tile_y_k, ckw::BinaryOp::Div, tile_i, const_kernel_w_i32); + + auto tile_ck = writer->declare_tile("ck", ckw::TileInfo(ckw::DataType::Int32)); + writer->op_assign(tile_ck, const_0_i32); + + // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor + // x_s = xi + x_k + // y_s = yi + y_k + // mi = x_s + y_s * width; + // mi = select(-1, mi, x_s >= 0); + // mi = select(-1, mi, x_s < width); + // mi = select(-1, mi, y_s >= 0); + // mi = select(-1, mi, y_s < height); + auto tile_xs = writer->declare_tile("xs", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_ys = writer->declare_tile("ys", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_mi = writer->declare_tile("mi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + 
+ auto tile_xs_gte_0 = writer->declare_tile("xs_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_ys_gte_0 = writer->declare_tile("ys_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_xs_lt_w = writer->declare_tile("xs_lt_w", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + auto tile_ys_lt_h = writer->declare_tile("ys_lt_h", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1)); + + writer->op_binary(tile_xs, ckw::BinaryOp::Add, tile_xi, tile_x_k); + writer->op_binary(tile_ys, ckw::BinaryOp::Add, tile_yi, tile_y_k); + writer->op_binary(tile_mi, ckw::BinaryOp::Mul, tile_ys, const_src_w_i32); + writer->op_binary(tile_mi, ckw::BinaryOp::Add, tile_mi, tile_xs); + writer->op_binary(tile_xs_gte_0, ckw::BinaryOp::GreaterEqual, tile_xs, const_0_i32); + writer->op_binary(tile_ys_gte_0, ckw::BinaryOp::GreaterEqual, tile_ys, const_0_i32); + writer->op_binary(tile_xs_lt_w, ckw::BinaryOp::Less, tile_xs, const_src_w_i32); + writer->op_binary(tile_ys_lt_h, ckw::BinaryOp::Less, tile_ys, const_src_h_i32); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_gte_0); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_gte_0); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_lt_w); + writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_lt_h); + + writer->op_for_loop(tile_ck, ckw::BinaryOp::LessEqual, const_src_c_i32_minus_k0_i32, tile_ck, ckw::AssignmentOp::Increment, const_k0_i32, [&]() + { + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, k0)); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, k0)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, const_0_fp); + + writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout); + 
writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + }); + + // Left-over accumulations for when K is not a multiple of k0 + if(((src_c % k0) != 0)) + { + writer->op_for_loop(tile_ck, ckw::BinaryOp::Less, const_src_c_i32, tile_ck, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]() + { + auto tile_lhs = writer->declare_tile("lhs_leftover", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, 1)); + auto tile_rhs = writer->declare_tile("rhs_leftover", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, 1)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, const_0_fp); + + writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout); + writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + }); + } + + writer->op_binary(tile_y_b, ckw::BinaryOp::Add, tile_y_b, const_pos_1_i32); + }); + // clang-format on + + // NOTE: The bias addition will be removed from this kernel as the interface is standardized. The intended way of + // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. 
+ if (using_bias) + { + if (!bia->has_tile()) + { + auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0)); + writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout, const_0_i32, const_0_i32, const_0_i32); + bia->init_virtual_tensor(tile_bia, sampler_bia); + } + auto &tile_bia = bia->tile(); + + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia); + } + + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwDirectConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const auto dst_shape = _dst->tensor_shape(); + const auto desc = _settings.direct_conv_descriptor(); + + const uint32_t dst_n0 = adjust_vec_size(desc.n0, dst_shape[0]); + const uint32_t dst_m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]); + + Window win = calculate_max_window(dst_shape, Steps(dst_n0, dst_m0)); + + const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], dst_m0); + win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, dst_m0)); + win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1)); + + return win; +} + +std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + return "direct_conv2d"; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h new file mode 100644 index 0000000000..139cf620e2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwDirectConv2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentDirectConv2d::Attributes; + using Settings = ClComponentDirectConv2d::Settings; + +public: + /** Constructor + * + * For supported configurations please refer to @ref ClComponentDirectConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does + * @param[in] settings Component settings. 
Settings are a set of parameters that influence the implementation of a component + */ + GpuCkwDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d); + /** Destructor */ + ~GpuCkwDirectConv2d() override = default; + + // Inherited methods overriden + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_wei; + const ITensorInfo *_bia; + const ITensorInfo *_dst; + + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp new file mode 100644 index 0000000000..fb55acad53 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "GpuCkwElementwiseBinary.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "support/StringSupport.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include "compute_kernel_writer/include/ckw/types/ConstantData.h" +#include "compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h" +#include <cstdint> +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} +{ + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); +} + +void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + 
/******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs"); + GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_h = static_cast<int32_t>(_dst->dimension(1)); + + // CKW constants + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The compute block parameters depend on the employed tensor format + + // Destination compute block size + int32_t dst_n0 = -1; + int32_t dst_m0 = -1; + + // Destination compute block size left-over + int32_t dst_n0_partial = -1; + int32_t dst_m0_partial = -1; + + if (!dst->has_tile()) + { + // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1 + // as tensor format + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + dst_n0 = root_window.x().step(); + dst_m0 = root_window.y().step(); + 
dst_n0_partial = _dst->dimension(0) % dst_n0; + dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + ckw::DataType dst_dt = to_ckw(_dst->data_type()); + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + } + else + { + // Change dst_n0 and dst_m0 if NOT root component! + dst_n0 = dst->tile().tile_info().width(); + dst_m0 = dst->tile().tile_info().height(); + + // Here, it is not required the calculation of dst_n0_partial and dst_m0_partial + // because if we enter this condition it means that the element-wise op is not the + // root component and the address modes have been already set. + } + + const auto &tile_dst = dst->tile(); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // ... 
+ + /******************************************************************************** + * 5 - Define the samplers for the input tensors + ********************************************************************************/ + // Check whether the lhs tensor is a tile or tensor + // If it is a tile, create a sampler and load the content in a tile + if (!lhs->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_lhs = dst->tensor_sampler(); + + bool broadcast_x = false; + bool broadcast_y = false; + + int32_t lhs_n0 = dst_n0; + int32_t lhs_m0 = dst_m0; + + // Check whether we have broadcasting + // In case of broadcast, lhs can only be a vector or scalar. + // Broadcasting in other dimensions is not supported + if (_dst->dimension(0) != _lhs->dimension(0)) + { + broadcast_x = true; + lhs_n0 = 1; + } + + if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + if (_dst->dimension(1) * _dst->dimension(2) != _lhs->dimension(1) * _lhs->dimension(2)) + { + broadcast_y = true; + lhs_m0 = 1; + } + } + else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + if (_dst->dimension(1) != _lhs->dimension(1)) + { + broadcast_y = true; + lhs_m0 = 1; + } + } + + const int32_t lhs_partial_n0 = _lhs->dimension(0) % lhs_n0; + const int32_t lhs_shift_back = (lhs_n0 - lhs_partial_n0) % lhs_n0; + + // Constants + auto const_lhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_n0}}, ckw::DataType::Int32)); + auto const_lhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_m0}}, ckw::DataType::Int32)); + auto const_lhs_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{lhs_shift_back}}, ckw::DataType::Int32)); + + auto tile_gid_0 = writer->declare_tile("gid_0_lhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1_lhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2_lhs", ckw::TileInfo(ckw::DataType::Int32)); + + 
writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = + writer->declare_tile("mout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT + auto tile_mout1 = writer->declare_tile("mout1_lhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0 + auto tile_bout0 = writer->declare_tile("bout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + if (!broadcast_x) + { + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_lhs_n0_i32, + const_lhs_shift_back_n0_i32, const_0_i32); + } + else + { + writer->op_assign(tile_cout0, const_0_i32); + } + + if (!broadcast_y) + { + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_lhs_m0_i32); + } + else + { + writer->op_assign(tile_mout0, const_0_i32); + } + + // Get the boundary aware coordinates at each global dimension index + if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + writer->op_assign(tile_mout1, const_0_i32); + get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32); + } + else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + // For tile_mout1 and tile_bout0 the step can only be 1 + if (!broadcast_y) + { + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + } + else + { + // If broadcast_y == true, it means that we have either a scalar or vector + // because broadcasting in other dimensions is not supported + writer->op_assign(tile_mout1, const_0_i32); + } + + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + } + + ckw::DataType lhs_dt = to_ckw(_lhs->data_type()); + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(lhs_dt, lhs_m0, lhs_n0)); + + writer->op_load(tile_lhs, lhs->tensor(), 
sampler_lhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() is used to bring the tile_lhs outside the compound statement + lhs->init_virtual_tensor(tile_lhs, sampler_lhs); + } + + // Check whether the rhs tensor is a tile or tensor + // If it is a tile, create a sampler and load the content in a tile + if (!rhs->has_tile()) + { + // Sampler + ckw::TensorSampler sampler_rhs = dst->tensor_sampler(); + + bool broadcast_x = false; + bool broadcast_y = false; + + int32_t rhs_n0 = dst_n0; + int32_t rhs_m0 = dst_m0; + + // Check whether we have broadcasting + // In case of broadcast, rhs can only be a vector or scalar. + // Broadcasting in other dimensions is not supported + if (_dst->dimension(0) != _rhs->dimension(0)) + { + broadcast_x = true; + rhs_n0 = 1; + } + + if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1) + { + if (_dst->dimension(1) * _dst->dimension(2) != _rhs->dimension(1) * _rhs->dimension(2)) + { + broadcast_y = true; + rhs_m0 = 1; + } + } + else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2) + { + if (_dst->dimension(1) != _rhs->dimension(1)) + { + broadcast_y = true; + rhs_m0 = 1; + } + } + + const int32_t rhs_partial_n0 = _rhs->dimension(0) % rhs_n0; + const int32_t rhs_shift_back = (rhs_n0 - rhs_partial_n0) % rhs_n0; + + // Constants + auto const_rhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_n0}}, ckw::DataType::Int32)); + auto const_rhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_m0}}, ckw::DataType::Int32)); + auto const_rhs_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{rhs_shift_back}}, ckw::DataType::Int32)); + + auto tile_gid_0 = writer->declare_tile("gid_0_rhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1_rhs", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2_rhs", ckw::TileInfo(ckw::DataType::Int32)); + + 
writer->op_get_global_id(tile_gid_0, 0);
+        writer->op_get_global_id(tile_gid_1, 1);
+        writer->op_get_global_id(tile_gid_2, 2);
+
+        auto tile_cout0 = writer->declare_tile("cout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+        auto tile_mout0 =
+            writer->declare_tile("mout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+        auto tile_mout1 = writer->declare_tile("mout1_rhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+        auto tile_bout0 = writer->declare_tile("bout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+        // Calculate coordinates
+        if (!broadcast_x)
+        {
+            get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_rhs_n0_i32,
+                                                    const_rhs_shift_back_n0_i32, const_0_i32);
+        }
+        else
+        {
+            writer->op_assign(tile_cout0, const_0_i32);
+        }
+
+        if (!broadcast_y)
+        {
+            get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_rhs_m0_i32);
+        }
+        else
+        {
+            writer->op_assign(tile_mout0, const_0_i32);
+        }
+
+        // Get the boundary aware coordinates at each global dimension index
+        if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+        {
+            writer->op_assign(tile_mout1, const_0_i32);
+            get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+        }
+        else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+        {
+            // For tile_mout1 and tile_bout0 the step can only be 1
+            const auto src_w = static_cast<int32_t>(_rhs->dimension(1));
+            auto const_src_w = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+            if (!broadcast_y)
+            {
+                // Decompose the collapsed Z global id into (height, batch), mirroring the lhs path:
+                // mout1 = gid_2 % H, bout0 = gid_2 / H. The previous code read tile_mout1 here before
+                // it was ever assigned in this branch, producing undefined coordinates.
+                writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_src_w);
+            }
+            else
+            {
+                // If broadcast_y == true, it means that we have either a scalar or vector
+                // because broadcasting in other dimensions is not supported
+                writer->op_assign(tile_mout1, const_0_i32);
+            }
+
+            // The batch index must be derived from the raw global id, not from the remainder above.
+            writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_src_w);
+        }
+
+        ckw::DataType rhs_dt =
to_ckw(_rhs->data_type()); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(rhs_dt, rhs_m0, rhs_n0)); + + writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0); + + // Here, init_virtual_tensor() is used to bring the tile_rhs outside the compound statement + rhs->init_virtual_tensor(tile_rhs, sampler_rhs); + } + + const auto &tile_lhs = lhs->tile(); + const auto &tile_rhs = rhs->tile(); + + /******************************************************************************** + * 7 - Write the rest of the code + ********************************************************************************/ + // Perform the element-wise operation + writer->op_binary(tile_dst, to_ckw(_attributes), tile_lhs, tile_rhs); + + ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component"); +} + +Window GpuCkwElementwiseBinary::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + constexpr uint32_t vector_size_byte_opencl = 16; + const uint32_t num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + + return win; +} + +std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + const std::vector<std::string> build_params = { + "elementwise_binary", + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), + }; + return join(build_params, "_"); +} +} // namespace dynamic_fusion +} // namespace 
experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h new file mode 100644 index 0000000000..c6cbba28d3 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwElementwiseBinary : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentElementwiseBinary::Attributes; + /** Constructor + * + * For supported configurations please refer to @ref ClComponentElementwiseBinary::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + */ + GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary); + /** Destructor */ + ~GpuCkwElementwiseBinary() override = default; + // Inherited methods overriden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_lhs; + const ITensorInfo *_rhs; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp new file mode 100644 index 0000000000..14ad3847fc --- /dev/null +++ 
b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "support/StringSupport.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwMatMul::GpuCkwMatMul(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings} +{ + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); +} + +void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs"); + GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + 
/******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto k = + _attributes.adj_lhs() ? static_cast<int32_t>(_lhs->dimension(1)) : static_cast<int32_t>(_lhs->dimension(0)); + const auto k0 = static_cast<int32_t>(adjust_vec_size(_settings.k0(), k)); + const auto dst_dt = to_ckw(_dst->data_type()); + + // CKW constants + auto const_k_i32 = writer->declare_constant_tile(ckw::ConstantData({{k}}, ckw::DataType::Int32)); + auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_k_minus_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k - k0}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0; + + // Shift-back for the overlapping-min strategy + const 
int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the samplers for the input tensors + ********************************************************************************/ + // LHS SAMPLER + // The assumption here is that M is multiple of M0. 
This limitation will be removed once + // we have the support for OverlappingMin as address mode for the Y direction + ckw::TensorSampler sampler_lhs; + sampler_lhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_lhs.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_lhs.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_lhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_lhs.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // RHS SAMPLER + ckw::TensorSampler sampler_rhs; + sampler_rhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_rhs.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_rhs.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_rhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_rhs.storage(ckw::TensorStorageType::BufferUint8Ptr); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code (optional) + ********************************************************************************/ + + // Not required + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_idx_n = writer->declare_tile("idx_n", ckw::TileInfo(ckw::DataType::Int32)); // N index + auto tile_idx_m = writer->declare_tile("idx_m", ckw::TileInfo(ckw::DataType::Int32)); // M index + auto tile_idx_b = writer->declare_tile("idx_b", 
ckw::TileInfo(ckw::DataType::Int32)); // BATCH index + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_idx_n, tile_gid_0, const_dst_n0_i32, + const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_idx_m, tile_gid_1, const_dst_m0_i32); + get_coordinate_from_gws(writer, tile_idx_b, tile_gid_2, const_pos_1_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + auto tile_idx_k = writer->declare_tile("idx_k", ckw::TileInfo(ckw::DataType::Int32)); // K index + + writer->op_assign(tile_idx_k, const_0_i32); + + // clang-format off + writer->op_for_loop(tile_idx_k, ckw::BinaryOp::LessEqual, const_k_minus_k0_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_k0_i32, + [&]() + { + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, k0)); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, k0)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, const_0_fp); + + writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32); + writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + + }); + + // Left-over accumulations for when K is not a multiple of k0 + if(((k % k0) != 0)) + { + writer->op_for_loop(tile_idx_k, ckw::BinaryOp::Less, const_k_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]() + { + auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, 1)); + auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, 1)); + writer->op_assign(tile_lhs, const_0_fp); + writer->op_assign(tile_rhs, 
// NOTE(review): flattened diff residue. This span holds the tail of GpuCkwMatMul::write_component_code
// (the leftover K-loop: when K is not a multiple of k0, LHS/RHS are loaded one column at a time and
// accumulated into tile_dst via ckw::BinaryOp::MatMul_Nt_T), followed by two complete methods:
//  - GpuCkwMatMul::get_window(): builds the dispatch window over _dst. n0 is always rounded with
//    adjust_vec_size(); m0 is rounded with adjust_vec_size() only when adj_lhs(), otherwise clamped
//    with std::min(_settings.m0(), m). The Z dimension of the window is collapsed.
//  - GpuCkwMatMul::get_name(): returns "mat_mul_native" + "_t"/"_nt" suffixes for the lhs and rhs
//    transpose flags, then "_"-separated M, N, K, batch (_dst->dimension(2)), m0, n0, k0 — the name
//    therefore uniquely encodes the kernel configuration.
// The trailing text is the unified-diff header plus the MIT license opening of the new file
// GpuCkwMatMul.h; it is patch metadata, not code.
const_0_fp); + + writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32); + writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32); + + writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs); + }); + } + // clang-format on +} + +Window GpuCkwMatMul::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const int32_t m = _dst->dimension(1); + const int32_t n = _dst->dimension(0); + const bool adj_lhs = _attributes.adj_lhs(); + + const int32_t m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m); + const int32_t n0 = adjust_vec_size(_settings.n0(), n); + + // Configure kernel window + Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0)); + win = win.collapse(win, Window::DimZ); + + return win; +} + +std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string kernel_name("mat_mul_native"); + + const int32_t m = _dst->dimension(1); + const int32_t n = _dst->dimension(0); + const int32_t k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x(); + + kernel_name += _attributes.adj_lhs() ? "_t" : "_nt"; + kernel_name += _attributes.adj_rhs() ?
"_t" : "_nt"; + kernel_name += "_"; + kernel_name += support::cpp11::to_string(m); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(n); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(k); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_dst->dimension(2)); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.m0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.n0()); + kernel_name += "_"; + kernel_name += support::cpp11::to_string(_settings.k0()); + + return kernel_name; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h new file mode 100644 index 0000000000..790418bf50 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// NOTE(review): remainder of the MIT license, then the complete declaration of GpuCkwMatMul —
// a `final` CKW component driver for matrix multiplication. Attributes/Settings are aliases of
// ClComponentMatMul's types; the class stores non-owning ITensorInfo pointers to lhs/rhs/dst
// plus copies of the attributes and settings. Copy is disallowed, move allowed
// (ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE). The span ends with the diff header and MIT license
// opening of the next new file, GpuCkwPool2d.cpp (patch metadata, not code).
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwMatMul final : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentMatMul::Attributes; + using Settings = ClComponentMatMul::Settings; + +public: + /** Constructor + * + * For supported configurations please refer to @ref ClComponentMatMul::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does + * @param[in] settings Component settings.
Settings are a set of parameters that influence the implementation of a component + */ + GpuCkwMatMul(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwMatMul); + + /** Destructor */ + ~GpuCkwMatMul() override = default; + + // Inherited methods overridden + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_lhs; + const ITensorInfo *_rhs; + const ITensorInfo *_dst; + + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp new file mode 100644 index 0000000000..d027f348ef --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
// NOTE(review): license tail and includes of GpuCkwPool2d.cpp, then the complete constructor:
// it caches the ACL_SRC_0 / ACL_DST_0 ITensorInfo pointers from the argument pack and raises
// ARM_COMPUTE_ERROR_ON_NULLPTR if either is missing. The span ends with the opening of
// GpuCkwPool2d::write_component_code (its signature and the first "const uint32_t" token,
// continued on the following flattened line of the patch).
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +#include "compute_kernel_writer/include/ckw/KernelWriter.h" +#include <cstdint> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +GpuCkwPool2d::GpuCkwPool2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} + +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const uint32_t
// NOTE(review): body of GpuCkwPool2d::write_component_code. Grounded observations, in order:
//  - Steps 1-2: declares the "src"/"dst" kernel arguments via the variable table and Int32
//    constant tiles for pool size, pad (left/top), stride and the relevant src/dst dimensions.
//  - const_neg_inf_val_fp is built from the literal -1.0f / 0.0f, i.e. -infinity; it is the
//    MAX-pooling init value when _settings.use_inf_as_limit() is true, otherwise
//    std::numeric_limits<float>::lowest() is used.
//  - Step 3: the dst sampler uses OverlappingMin on X only when dst width is not a multiple of
//    n0, and ClampToBorderMaxOnly on Y only when dst height is not a multiple of m0; the dst
//    tile is bound with init_virtual_tensor so fused components can share it.
//  - Step 5: the src sampler uses AddressMode None on every axis — out-of-bounds reads are
//    instead prevented by clamping the loop bounds (pool_x_s/e, pool_y_s/e) below.
//  - Step 6: global pooling is detected as pool size == src size with zero padding; non-MAX
//    FP16 pooling accumulates in FP32 (is_wider_acc drives a load-then-cast and a final cast).
//  - Step 7: gid_2 is decomposed as mout1 = gid_2 % dst_h (height) and bout0 = gid_2 / dst_h
//    (batch), matching the Z-collapsed window produced by get_window().
//  - Step 8: the pooling window is intersected with the source bounds; filter_size is the
//    valid area when exclude_padding() else pool_sz_x * pool_sz_y; L2 pooling squares the
//    inputs before accumulation, divides by filter_size and applies Sqrt at the end; AVG
//    divides only. MAX accumulates with Max, AVG/L2 with Add.
// The final flattened line also contains most of GpuCkwPool2d::get_window(): vector size 2
// for F32 / 4 otherwise (adjusted to dst width), window collapsed on Z.
width_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const auto pool_sz_x = static_cast<int32_t>(_attributes.pool_size().x()); + const auto pool_sz_y = static_cast<int32_t>(_attributes.pool_size().y()); + const auto pad_x = static_cast<int32_t>(_attributes.pad().left); + const auto pad_y = static_cast<int32_t>(_attributes.pad().top); + const auto stride_x = static_cast<int32_t>(_attributes.stride().x()); + const auto stride_y = static_cast<int32_t>(_attributes.stride().y()); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + + // CKW constants + auto const_pool_sz_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_x}}, ckw::DataType::Int32)); + auto const_pool_sz_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_y}}, ckw::DataType::Int32)); + auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32)); + auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32)); + auto const_stride_x_i32 =
writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32)); + auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32)); + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_lowest_val_fp = + writer->declare_constant_tile(ckw::ConstantData({{std::numeric_limits<float>::lowest()}}, ckw::DataType::Fp32)); + auto const_neg_inf_val_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f / 0.0f}}, ckw::DataType::Fp32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + const int32_t dst_m0 = root_window.y().step(); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0; + + // Shift-back for the overlapping-min strategy + const
int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + + if (dst_m0_partial == 0) + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + } + else + { + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly); + } + + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + // Only now we can declare the N0 and M0 as constant + auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32)); + auto const_shift_back_dst_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); +
sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code + ********************************************************************************/ + // Check if it is global pooling + const bool is_global_pooling = (pool_sz_x == src_w) && (pool_sz_y == src_h) && (pad_x == 0) && (pad_y == 0); + + // Accumulate always in F32 if the pool type is not MAX + const bool acc_f32 = (dst_dt == ckw::DataType::Fp32) || + ((dst_dt == ckw::DataType::Fp16) && _attributes.pool_type() != PoolingType::MAX); + + const auto acc_dt = acc_f32 ? ckw::DataType::Fp32 : ckw::DataType::Fp16; + + const bool is_wider_acc = dst_dt != acc_dt; + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32, +
const_shift_back_dst_n0_i32, const_0_i32); + get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32); + writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + // A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling. + auto tile_res = writer->declare_tile("tile_res", ckw::TileInfo(acc_dt, dst_m0, dst_n0)); + + // Initialise result tile with appropriate value + if (_attributes.pool_type() == PoolingType::MAX) + { + if (_settings.use_inf_as_limit()) + { + writer->op_cast(tile_res, const_neg_inf_val_fp, ckw::ConvertPolicy::None); + } + else + { + writer->op_cast(tile_res, const_lowest_val_fp, ckw::ConvertPolicy::None); + } + } + else + { + writer->op_cast(tile_res, const_0_fp, ckw::ConvertPolicy::None); + } + + // tile_idx_in_w = tile_mout0 * STRIDE_X - PAD_X + auto tile_src_coord_x_start = writer->declare_tile("idx_in_w", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32); + writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Sub, tile_src_coord_x_start, const_pad_x_i32); + + // tile_idx_in_h = tile_mout1 * STRIDE_Y - PAD_Y + auto tile_src_coord_y_start = writer->declare_tile("idx_in_h", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32); + writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Sub, tile_src_coord_y_start, const_pad_y_i32); + + auto tile_neg_src_coord_x_start = writer->declare_tile("neg_src_coord_x_start", ckw::DataType::Int32); + auto tile_neg_src_coord_y_start = writer->declare_tile("neg_src_coord_y_start", ckw::DataType::Int32); + +
writer->op_binary(tile_neg_src_coord_x_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_x_start); + writer->op_binary(tile_neg_src_coord_y_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_y_start); + + // int pool_x_s = max((int)0, -idx_in_w); + // int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + // int pool_y_s = max((int)0, -idx_in_h); + // int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + auto tile_pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32); + auto tile_pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32); + auto tile_pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32); + auto tile_pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32); + + writer->op_binary(tile_pool_x_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_x_start); + writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Add, const_src_w_i32, tile_neg_src_coord_x_start); + writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Min, const_pool_sz_x_i32, tile_pool_x_e); + writer->op_binary(tile_pool_y_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_y_start); + writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Add, const_src_h_i32, tile_neg_src_coord_y_start); + writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Min, const_pool_sz_y_i32, tile_pool_y_e); + + // #if defined(EXCLUDE_PADDING) + // int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); + // #else // defined(EXCLUDE_PADDING) + // int filter_size = POOL_SIZE_X * POOL_SIZE_Y; + // #endif // defined(EXCLUDE_PADDING) + auto tile_filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32); + if (_attributes.exclude_padding()) + { + auto tile_x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32); + auto tile_y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32); + + writer->op_binary(tile_x_diff, ckw::BinaryOp::Sub, tile_pool_x_e, tile_pool_x_s); + writer->op_binary(tile_y_diff, ckw::BinaryOp::Sub,
tile_pool_y_e, tile_pool_y_s); + writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, tile_x_diff, tile_y_diff); + } + else + { + writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, const_pool_sz_x_i32, const_pool_sz_y_i32); + } + + auto tile_x = writer->declare_tile("x", ckw::DataType::Int32); + auto tile_y = writer->declare_tile("y", ckw::DataType::Int32); + + if (is_global_pooling) + { + writer->op_assign(tile_y, const_0_i32); + writer->op_assign(tile_pool_y_e, const_pool_sz_y_i32); + } + else + { + writer->op_assign(tile_y, tile_pool_y_s); + } + + // Y dim for-loop + writer->op_for_loop( + tile_y, ckw::BinaryOp::Less, tile_pool_y_e, tile_y, ckw::AssignmentOp::Increment, const_pos_1_i32, + [&]() + { + // Reset the iterator for the inner loop + if (is_global_pooling) + { + writer->op_assign(tile_x, const_0_i32); + writer->op_assign(tile_pool_x_e, const_pool_sz_x_i32); + } + else + { + writer->op_assign(tile_x, tile_pool_x_s); + } + + auto tile_src_coord_y = writer->declare_tile("src_coord_y", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_y, ckw::BinaryOp::Add, tile_src_coord_y_start, tile_y); + + // X dim for-loop + writer->op_for_loop( + tile_x, ckw::BinaryOp::Less, tile_pool_x_e, tile_x, ckw::AssignmentOp::Increment, const_pos_1_i32, + [&]() + { + auto tile_src_coord_x = writer->declare_tile("src_coord_x", ckw::DataType::Int32); + writer->op_binary(tile_src_coord_x, ckw::BinaryOp::Add, tile_src_coord_x_start, tile_x); + + ckw::DataType src_dt = to_ckw(_src->data_type()); + auto tile_src = writer->declare_tile("tile_src", ckw::TileInfo(acc_dt, dst_m0, dst_n0)); + + // Load src tile + if (is_wider_acc) + { + auto tile_src0 = writer->declare_tile("src_tile0", ckw::TileInfo(src_dt, dst_m0, dst_n0)); + writer->op_load(tile_src0, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x, + tile_src_coord_y, tile_bout0); + writer->op_cast(tile_src, tile_src0, ckw::ConvertPolicy::None); + } + else + { + writer->op_load(tile_src, src->tensor(),
sampler_src, tile_cout0, tile_src_coord_x, + tile_src_coord_y, tile_bout0); + } + + // Take the square of the input, for L2 Pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_binary(tile_src, ckw::BinaryOp::Mul, tile_src, tile_src); + } + + // Perform Pooling op + if (_attributes.pool_type() == PoolingType::MAX) + { + writer->op_binary(tile_res, ckw::BinaryOp::Max, tile_res, tile_src); + } + else + { + writer->op_binary(tile_res, ckw::BinaryOp::Add, tile_res, tile_src); + } + }); + }); + + if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) + { + // Filter_size is automatically broadcasted in the operation + auto tile_filter_size_fp = writer->declare_tile("filter_size_fp", ckw::TileInfo(acc_dt)); + writer->op_cast(tile_filter_size_fp, tile_filter_size, ckw::ConvertPolicy::None); + writer->op_binary(tile_res, ckw::BinaryOp::Div, tile_res, tile_filter_size_fp); + } + + // Take square root of the result in L2 pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_unary(tile_res, ckw::UnaryOp::Sqrt, tile_res); + } + + // Store the results and do casting if mixed precision + if (is_wider_acc) + { + writer->op_cast(tile_dst, tile_res, ckw::ConvertPolicy::None); + } + else + { + writer->op_assign(tile_dst, tile_res); + } +} + +Window GpuCkwPool2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + const uint32_t vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + // Create and configure kernel window + auto win = calculate_max_window(output_shape, Steps(vec_size)); + win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
// NOTE(review): tail of GpuCkwPool2d::get_window() (returns the window computed on the
// previous line), plus the complete get_name(), which ignores the component group and
// returns the fixed kernel name "pool2dMxN". The remainder of the span is the unified-diff
// header and MIT license of the new file GpuCkwPool2d.h (patch metadata, not code).
+ return win; +} + +std::string GpuCkwPool2d::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + return "pool2dMxN"; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h new file mode 100644 index 0000000000..822282a108 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
// NOTE(review): complete declaration of GpuCkwPool2d, the CKW component driver for 2D pooling.
// Attributes/Settings alias ClComponentPool2d's types; the class stores non-owning src/dst
// ITensorInfo pointers. The original constructor doc referenced ClComponentCast::validate() —
// a copy-paste slip, corrected below to ClComponentPool2d::validate() (this file includes
// ClComponentPool2d.h and uses its Attributes/Settings). The span ends with the diff header
// of the next new file, GpuCkwResize.cpp (patch metadata, not code).
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" + +#include <string> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwPool2d : public IGpuCkwComponentDriver +{ +public: + using Attributes = ClComponentPool2d::Attributes; + using Settings = ClComponentPool2d::Settings; + + /** Constructor + * + * For supported configurations please refer to @ref ClComponentPool2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + GpuCkwPool2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwPool2d); + /** Destructor */ + ~GpuCkwPool2d() override = default; + // Inherited methods overridden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp new file mode 100644 index 0000000000..edd7ea9a38 --- /dev/null +++
b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "support/StringSupport.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +constexpr uint32_t opencl_vector_size_in_bytes = 16; +} // namespace + +GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); +} + +void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const uint32_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); + const uint32_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + GpuCkwComponentArgument *src = 
vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + + // CKW constants + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32)); + auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32)); + auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32)); + + /******************************************************************************** + * 3 - Define the compute block 
parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + + // dst_m0 must be 1 + ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_shift_back_n0_i32 = + 
writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the samplers for the input tensor + ********************************************************************************/ + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code + ********************************************************************************/ + + // .... + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32, + 
const_0_i32); + writer->op_assign(tile_xo, tile_gid_1); + writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); + auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); + + switch (_attributes.sampling_policy()) + { + case SamplingPolicy::TOP_LEFT: + // xi_f = (xo * scale_x) + // yi_f = (yo * scale_y) + writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp); + break; + case SamplingPolicy::CENTER: + { + // xi_f = ((xo + 0.5f) * scale_x) + // yi_f = ((yo + 0.5f) * scale_y) + const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32); + const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32); + + writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported sampling policy"); + } + + if (_attributes.align_corners()) + { + writer->op_unary(tile_xi_f, ckw::UnaryOp::Round, tile_xi_f); + 
writer->op_unary(tile_yi_f, ckw::UnaryOp::Round, tile_yi_f); + } + + // xi0 = clamp((int)xi_f, 0, (int)src_w - 1) + // yi0 = clamp((int)yi_f, 0, (int)src_h - 1) + auto tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32); + auto tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32); + + writer->op_cast(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None); + + auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32); + auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32); + + writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32); + writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32); + + auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); + auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); + + writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi_f_int, const_0_i32, tile_src_w_minus_1); + writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi_f_int, const_0_i32, tile_src_h_minus_1); + + auto tile_src = writer->declare_tile("src_tile", ckw::TileInfo(dst_dt, 1, dst_n0)); + writer->op_load(tile_src, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo); + + writer->op_assign(tile_dst, tile_src); +} + +void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); + const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); + + /******************************************************************************** + * 1 - Define tensors + ********************************************************************************/ + 
GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + + /******************************************************************************** + * 2 - Define CKW constants + ********************************************************************************/ + const auto dst_dt = to_ckw(_dst->data_type()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); + const auto src_w = static_cast<int32_t>(_src->dimension(width_idx)); + const auto src_h = static_cast<int32_t>(_src->dimension(height_idx)); + const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx)); + + // CKW constants + auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32)); + auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32)); + auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32)); + auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32)); + auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32)); + auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt)); + auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, ckw::DataType::Fp32)); + auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32)); + auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32)); + auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, 
ckw::DataType::Fp32)); + + /******************************************************************************** + * 3 - Define the compute block parameters and destination tile (if not root component) + * Bind the tile to the tensor to share it among different components and + * initialize the compute block parameters + ********************************************************************************/ + // The n0 and m0 parameters from root_window only refers to the output + const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); + + // Destination compute block size + const int32_t dst_n0 = root_window.x().step(); + + // dst_m0 must be 1 + ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1); + + // Destination compute block size left-over + const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0; + + // Shift-back for the overlapping-min strategy + const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0; + + ckw::TensorSampler sampler_dst; + sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + if (dst_n0_partial == 0) + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None); + } + else + { + sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin); + } + sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr); + + // Declare destination tile + auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0)); + + // Initialize destination tile + writer->op_assign(tile_dst, const_0_fp); + + // Bind tile to the tensor + dst->init_virtual_tensor(tile_dst, sampler_dst); + + /******************************************************************************** + * 4 - Define the compute block parameters CKW constants + ********************************************************************************/ + auto const_n0_i32 = 
writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32)); + auto const_shift_back_n0_i32 = + writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32)); + + /******************************************************************************** + * 5 - Define the sampler for the input tensor + ********************************************************************************/ + ckw::TensorSampler sampler_src; + sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2); + sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None); + sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None); + sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None); + + /******************************************************************************** + * 6 - Extra operations required before writing the main code + ********************************************************************************/ + + // .... + + /******************************************************************************** + * 7 - Get the coordinates of the destination tile + ********************************************************************************/ + auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32)); + auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32)); + + writer->op_get_global_id(tile_gid_0, 0); + writer->op_get_global_id(tile_gid_1, 1); + writer->op_get_global_id(tile_gid_2, 2); + + auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM + auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH + auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT + auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX + + // Calculate coordinates + 
get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32, + const_0_i32); + writer->op_assign(tile_xo, tile_gid_1); + writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32); + writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32); + + /******************************************************************************** + * 8 - Write the rest of the code + ********************************************************************************/ + auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); + auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); + + switch (_attributes.sampling_policy()) + { + case SamplingPolicy::TOP_LEFT: + // xi_f = (xo * scale_x) + // yi_f = (yo * scale_y) + writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp); + break; + case SamplingPolicy::CENTER: + { + // xi_f = ((xo + 0.5f) * scale_x - 0.5f) + // yi_f = ((yo + 0.5f) * scale_y - 0.5f) + const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32); + const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32); + + writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None); + writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None); + writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp); + writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp); + + writer->op_binary(tile_xi_f, ckw::BinaryOp::Sub, tile_xi_f, 
const_pos_0_5_fp); + writer->op_binary(tile_yi_f, ckw::BinaryOp::Sub, tile_yi_f, const_pos_0_5_fp); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported sampling policy"); + } + + // xi = (int)floor(xi_f); + // yi = (int)floor(yi_f); + auto tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32); + auto tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32); + writer->op_unary(tile_xi_f_floor, ckw::UnaryOp::Floor, tile_xi_f); + writer->op_unary(tile_yi_f_floor, ckw::UnaryOp::Floor, tile_yi_f); + + auto tile_xi = writer->declare_tile("xi", ckw::DataType::Int32); + auto tile_yi = writer->declare_tile("yi", ckw::DataType::Int32); + writer->op_cast(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None); + + // xi0 = clamp(xi, 0, (int)src_w - 1); + // yi0 = clamp(yi, 0, (int)src_h - 1); + // xi1 = clamp(xi + 1, 0, (int)src_w - 1); + // yi1 = clamp(yi + 1, 0, (int)src_h - 1); + auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32); + auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32); + writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32); + writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32); + + auto tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32); + auto tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32); + writer->op_binary(tile_xi_plus_1, ckw::BinaryOp::Add, tile_xi, const_pos_1_i32); + writer->op_binary(tile_yi_plus_1, ckw::BinaryOp::Add, tile_yi, const_pos_1_i32); + + auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); + auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); + auto tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32); + auto tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32); + + 
writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi, const_0_i32, tile_src_w_minus_1); + writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi, const_0_i32, tile_src_h_minus_1); + writer->op_ternary(tile_xi1, ckw::TernaryOp::Clamp, tile_xi_plus_1, const_0_i32, tile_src_w_minus_1); + writer->op_ternary(tile_yi1, ckw::TernaryOp::Clamp, tile_yi_plus_1, const_0_i32, tile_src_h_minus_1); + + auto tile_in00 = writer->declare_tile("in00", ckw::TileInfo(dst_dt, 1, dst_n0)); + auto tile_in01 = writer->declare_tile("in01", ckw::TileInfo(dst_dt, 1, dst_n0)); + auto tile_in10 = writer->declare_tile("in10", ckw::TileInfo(dst_dt, 1, dst_n0)); + auto tile_in11 = writer->declare_tile("in11", ckw::TileInfo(dst_dt, 1, dst_n0)); + + writer->op_load(tile_in00, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo); + writer->op_load(tile_in01, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi0, tile_bo); + writer->op_load(tile_in10, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi1, tile_bo); + writer->op_load(tile_in11, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi1, tile_bo); + + // Weights of each nearest pixel + auto tile_a = writer->declare_tile("a", ckw::DataType::Fp32); + auto tile_b = writer->declare_tile("b", ckw::DataType::Fp32); + auto tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32); + auto tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32); + + // a = (xi_f - (float)xi) + // b = (1.f - a) + // a1 = (yi_f - (float)yi) + // b1 = (1.f - a1) + auto tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32); + auto tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32); + writer->op_cast(tile_xi_float, tile_xi, ckw::ConvertPolicy::None); + writer->op_cast(tile_yi_float, tile_yi, ckw::ConvertPolicy::None); + + writer->op_binary(tile_a, ckw::BinaryOp::Sub, tile_xi_f, tile_xi_float); + writer->op_binary(tile_b, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a); + writer->op_binary(tile_a1, 
ckw::BinaryOp::Sub, tile_yi_f, tile_yi_float); + writer->op_binary(tile_b1, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a1); + + // Cast weights to source type + const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type())); + const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type())); + const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type())); + const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type())); + + writer->op_cast(tile_a_src_type, tile_a, ckw::ConvertPolicy::None); + writer->op_cast(tile_b_src_type, tile_b, ckw::ConvertPolicy::None); + writer->op_cast(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None); + writer->op_cast(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None); + + // in00 * b * b1 + writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b_src_type); + writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b1_src_type); + + // in01 * a * b1 + writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_a_src_type); + writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_b1_src_type); + + // in10 * b * a1 + writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_b_src_type); + writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_a1_src_type); + + // in11 * a * a1 + writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a_src_type); + writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a1_src_type); + + // Summation of above terms + writer->op_assign(tile_dst, tile_in00); + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in01); + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in10); + writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in11); +} + +void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const +{ + switch 
(_attributes.interpolation_policy()) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + do_nearest_neighbor_resize(comp_group, vtable, writer); + break; + case InterpolationPolicy::BILINEAR: + do_bilinear_resize(comp_group, vtable, writer); + break; + default: + ARM_COMPUTE_ERROR("Unsupported interpolation policy"); + } +} + +Window GpuCkwResize::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const uint32_t n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0)); + Window win = calculate_max_window(*_dst, Steps(n0)); + return win.collapse(win, Window::DimZ); +} + +std::string GpuCkwResize::get_tuner_id(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string tuner_id = "resize_"; + tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : ""; + tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : ""; + tuner_id += "_"; + tuner_id += _attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft"; + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(0)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(1)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(2)); + tuner_id += "_"; + tuner_id += support::cpp11::to_string(_dst->dimension(3)); + + return tuner_id; +} + +std::string GpuCkwResize::get_name(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + std::string name = "resize_"; + name += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : ""; + name += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? 
"bilinear" : ""; + + return name; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h new file mode 100644 index 0000000000..1266c05921 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** CKW code-writer driver for the Resize (scale) component; emits nearest-neighbor or bilinear kernels */
+class GpuCkwResize final : public IGpuCkwComponentDriver
+{
+public:
+    using Attributes = ClComponentResize::Attributes;
+
+public:
+    /** Constructor
+     *
+     * @param[in] id         Component id
+     * @param[in] tensors    Tensor arguments to the components
+     * @param[in] attributes Component attributes
+     */
+    GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwResize);
+
+    /** Destructor */
+    ~GpuCkwResize() override = default;
+
+    // Inherited methods overriden
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    Window      get_window() const override;
+    std::string get_name(const ComponentGroup &comp_group) const override;
+    std::string get_tuner_id(const ComponentGroup &comp_group) const override;
+
+private:
+    /** Resize using nearest neighbor interpolation
+     *
+     * @param[in]      comp_group Component group to which this component belongs to
+     * @param[in, out] vtable     Table of variables declared by this component
+     * @param[in, out] writer     CKW writer that writes code scoped to this kernel component
+     */
+    void do_nearest_neighbor_resize(const ComponentGroup    &comp_group,
+                                    GpuCkwVariableTable     &vtable,
+                                    GpuCkwScopedKernelWriter writer) const;
+
+    /** Resize using bilinear interpolation
+     *
+     * @param[in]      comp_group Component group to which this component belongs to
+     * @param[in, out] vtable     Table of variables declared by this component
+     * @param[in, out] writer     CKW writer that writes code scoped to this kernel component
+     */
+    void do_bilinear_resize(const ComponentGroup    &comp_group,
+                            GpuCkwVariableTable     &vtable,
+                            GpuCkwScopedKernelWriter writer) const;
+
+    const ITensorInfo *_src;        /**< Source tensor info (not owned) */
+    const ITensorInfo *_dst;        /**< Destination tensor info (not owned) */
+    Attributes         _attributes; /**< Resize attributes (interpolation policy, sampling policy, align_corners) */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
new file mode 100644
index 0000000000..d9d741fea5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwStore.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}
+{
+    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+void GpuCkwStore::write_component_code(const ComponentGroup    &comp_group,
+                                       GpuCkwVariableTable     &vtable,
+                                       GpuCkwScopedKernelWriter writer) const
+{
+    /********************************************************************************
+     * 1 - Define tensors
+     ********************************************************************************/
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+    /********************************************************************************
+     * 2 - Define CKW constants
+     ********************************************************************************/
+    // NOTE(review): dimension(2) is taken as the height, matching the Dim0_Dim1_Dim2 sampler
+    // format handled below; only used when that format is active.
+    const auto dst_h = static_cast<int32_t>(_dst->dimension(2));
+
+    auto const_0_i32     = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+    auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+    auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 3 - Define the compute block parameters and destination tile (if not root component)
+     *     Bind the tile to the tensor to share it among different components and
+     *     initialize the compute block parameters
+     ********************************************************************************/
+    // The store component reuses the tile and sampler already bound to the source tensor:
+    // the compute block shape comes from that tile, not from the root window.
+    const auto &tile_src    = src->tile();
+    auto       &sampler_src = src->tensor_sampler();
+
+    const auto    dst_n0         = static_cast<int32_t>(tile_src.tile_info().width());
+    const auto    dst_m0         = static_cast<int32_t>(tile_src.tile_info().height());
+    const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+    const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+    /********************************************************************************
+     * 4 - Define the compute block parameters CKW constants
+     ********************************************************************************/
+    auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+    auto const_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+    auto const_shift_back_n0_i32 =
+        writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+    /********************************************************************************
+     * 5 - Define the samplers for the input tensor
+     ********************************************************************************/
+    // Not required
+
+    /********************************************************************************
+     * 6 - Extra operations required before writing the main code
+     ********************************************************************************/
+    // Not required
+
+    /********************************************************************************
+     * 7 - Get the coordinates of the destination tile
+     ********************************************************************************/
+    auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+    auto tile_gid_1 = writer->declare_tile("gid_1",
experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h new file mode 100644 index 0000000000..c9ce7eb269 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H + +#include "src/core/common/Macros.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class GpuCkwStore : public IGpuCkwComponentDriver +{ +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the component + */ + GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwStore); + /** Destructor */ + ~GpuCkwStore() override = default; + // Inherited methods overridden: + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + std::string get_name(const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp new file mode 100644 index 0000000000..1e6f0841ad --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "CkwHelper.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step) +{ + writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step); +} + +void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step, + ckw::TileOperand &shift_back, + ckw::TileOperand &const_0) +{ + // Applied formula: max((gid * step) - shift_back, 0) + // where the shift_back operand is: (step - leftover_step) % step + + writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step); + writer->op_binary(coord, ckw::BinaryOp::Sub, coord, shift_back); + writer->op_binary(coord, ckw::BinaryOp::Max, coord, const_0); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h new file mode 100644 index 0000000000..956e7c8ecb --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Get coordinate along one axis. + * + * @param[in,out] writer Writer + * @param[out] coord Resultant coordinate + * @param[in] gid Global work item id + * @param[in] step Step size / vector size + */ +void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step); + +/** Get boundary aware coordinate along one axis. 
+ * + * @param[in,out] writer Writer + * @param[out] coord Resultant coordinate + * @param[in] gid Global work item id + * @param[in] step Step size / vector size + * @param[in] shift_back It is (step - leftover_step) % step + * @param[in] const_0 Constant tile of value 0 + */ +void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer, + ckw::TileOperand &coord, + const ckw::TileOperand &gid, + ckw::TileOperand &step, + ckw::TileOperand &shift_back, + ckw::TileOperand &const_0); +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp new file mode 100644 index 0000000000..ad31b06362 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "Common.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ckw::DataType to_ckw(DataType dt) +{ + switch (dt) + { + case DataType::F32: + return ckw::DataType::Fp32; + case DataType::F16: + return ckw::DataType::Fp16; + case DataType::S32: + return ckw::DataType::Int32; + case DataType::S16: + return ckw::DataType::Int16; + case DataType::S8: + case DataType::QASYMM8_SIGNED: + return ckw::DataType::Int8; + case DataType::U32: + return ckw::DataType::Uint32; + case DataType::U16: + return ckw::DataType::Uint16; + case DataType::U8: + case DataType::QASYMM8: + return ckw::DataType::Uint8; + default: + return ckw::DataType::Unknown; + } +} + +ckw::TensorShape to_ckw(const TensorShape &shape) +{ + ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{}); + ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5); + /// NOTE: Overflow danger. Use size_t? 
+ return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]), + static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]), + static_cast<int32_t>(shape[4])}; +} + +ckw::TensorDataLayout to_ckw(DataLayout dl) +{ + switch (dl) + { + case DataLayout::NHWC: + return ckw::TensorDataLayout::Nhwc; + case DataLayout::NDHWC: + return ckw::TensorDataLayout::Ndhwc; + default: + return ckw::TensorDataLayout::Unknown; + } +} + +ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info) +{ + return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()), + to_ckw(tensor_info.data_layout()), tensor_info.id()}; +} + +ckw::TensorStorageType to_ckw(const TensorStorageType &storage) +{ + switch (storage) + { + case TensorStorageType::ClBufferUint8Ptr: + return ckw::TensorStorageType::BufferUint8Ptr; + case TensorStorageType::ClImage2dReadOnly: + return ckw::TensorStorageType::Texture2dReadOnly; + case TensorStorageType::ClImage2dWriteOnly: + return ckw::TensorStorageType::Texture2dWriteOnly; + case TensorStorageType::Unknown: + return ckw::TensorStorageType::Unknown; + default: + ARM_COMPUTE_ERROR("Unknown tensor storage type"); + } +} + +TensorComponentType from_ckw(const ckw::TensorComponentType &component) +{ + switch (component) + { + case ckw::TensorComponentType::OffsetFirstElement: + return TensorComponentType::OffsetFirstElement; + case ckw::TensorComponentType::Stride0: + return TensorComponentType::Stride0; + case ckw::TensorComponentType::Stride1: + return TensorComponentType::Stride1; + case ckw::TensorComponentType::Stride2: + return TensorComponentType::Stride2; + case ckw::TensorComponentType::Stride3: + return TensorComponentType::Stride3; + case ckw::TensorComponentType::Stride4: + return TensorComponentType::Stride4; + case ckw::TensorComponentType::Dim0: + return TensorComponentType::Dim0; + case ckw::TensorComponentType::Dim1: + return TensorComponentType::Dim1; + case ckw::TensorComponentType::Dim2: + 
return TensorComponentType::Dim2; + case ckw::TensorComponentType::Dim3: + return TensorComponentType::Dim3; + case ckw::TensorComponentType::Dim4: + return TensorComponentType::Dim4; + case ckw::TensorComponentType::Dim1xDim2: + return TensorComponentType::Dim1xDim2; + case ckw::TensorComponentType::Dim2xDim3: + return TensorComponentType::Dim2xDim3; + case ckw::TensorComponentType::Dim1xDim2xDim3: + return TensorComponentType::Dim1xDim2xDim3; + case ckw::TensorComponentType::Unknown: + return TensorComponentType::Unknown; + default: + ARM_COMPUTE_ERROR("Unknown CKW tensor component"); + } +} + +TensorStorageType from_ckw(const ckw::TensorStorageType &storage) +{ + switch (storage) + { + case ckw::TensorStorageType::BufferUint8Ptr: + return TensorStorageType::ClBufferUint8Ptr; + case ckw::TensorStorageType::Texture2dReadOnly: + return TensorStorageType::ClImage2dReadOnly; + case ckw::TensorStorageType::Texture2dWriteOnly: + return TensorStorageType::ClImage2dWriteOnly; + case ckw::TensorStorageType::Unknown: + return TensorStorageType::Unknown; + default: + ARM_COMPUTE_ERROR("Unknown CKW tensor storage type"); + } +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h new file mode 100644 index 0000000000..26740cdd04 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H + +#include "arm_compute/core/CoreTypes.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + +#include "compute_kernel_writer/include/ckw/TensorInfo.h" +#include "compute_kernel_writer/include/ckw/types/DataType.h" +#include "compute_kernel_writer/include/ckw/types/TensorComponentType.h" +#include "compute_kernel_writer/include/ckw/types/TensorStorageType.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Convert the Compute Library data type to Compute Kernel Writer data type + * + * @param[in] dt The Compute Library data type + * + * @return the Compute Kernel Writer data type (ckw::DataType) + */ +ckw::DataType to_ckw(DataType dt); + +/** Convert the Compute Library tensor shape to Compute Kernel Writer tensor shape + * + * @param[in] shape The Compute Library tensor shape + * + * @return the Compute Kernel Writer tensor shape (ckw::TensorShape) + */ +ckw::TensorShape to_ckw(const TensorShape &shape); + +/** Convert the Compute Library data layout to Compute Kernel Writer data layout + * + * @param[in] dl The Compute Library data layout + * + * @return the Compute Kernel Writer data layout (ckw::TensorDataLayout) + */ +ckw::TensorDataLayout to_ckw(DataLayout dl); + +/** Convert the Compute Library tensor info to Compute Kernel Writer tensor info + * + * @param[in] tensor_info The Compute Library tensor info + * + * @return the Compute Kernel Writer tensor info (ckw::TensorInfo) + */ +ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info); + +/** Convert the Compute Library tensor storage to Compute Kernel Writer tensor storage + * + * @param[in] storage The Compute Library tensor storage + * + * @return the Compute Kernel Writer 
tensor storage (ckw::TensorStorageType) + */ +ckw::TensorStorageType to_ckw(const TensorStorageType &storage); + +/** Convert the Compute Kernel Writer tensor component to Compute Library tensor component + * + * @param[in] component The Compute Kernel Writer tensor component + * + * @return the Compute Library tensor component + */ +TensorComponentType from_ckw(const ckw::TensorComponentType &component); + +/** Convert the Compute Kernel Writer tensor storage to Compute Library tensor storage + * + * @param[in] storage The Compute Kernel Writer tensor storage + * + * @return the Compute Library tensor storage + */ +TensorStorageType from_ckw(const ckw::TensorStorageType &storage); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp new file mode 100644 index 0000000000..5630e390d5 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" + +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes) +{ + switch (attributes.operation()) + { + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add: + return ckw::BinaryOp::Add; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub: + return ckw::BinaryOp::Sub; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div: + return ckw::BinaryOp::Div; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul: + return ckw::BinaryOp::Mul; + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu: + case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff: + default: + ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp"); + } +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h new 
file mode 100644 index 0000000000..644a407702 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H + +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +#include "compute_kernel_writer/include/ckw/types/Operators.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes); +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h new file mode 100644 index 0000000000..ee109a7e2b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include "Types.h" +#include <memory> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids + */ +class GpuKernelComponentFactory +{ +public: + /** Create a new kernel component + * + * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent + * @tparam Args Argument types to construct the kernel component + * + * @param[in] args Arguments to construct the kernel component + * + * @return std::unique_ptr<IGpuKernelComponent> + */ + template <typename T, typename... Args> + std::unique_ptr<IGpuKernelComponent> create(Args &&...args) + { + return std::make_unique<T>(_count++, std::forward<Args>(args)...); + } + +private: + ComponentId _count{0}; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */ diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h new file mode 100644 index 0000000000..6678c929e9 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H

#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"

#include "Types.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Properties common to all kernel component types */
class KernelProperties
{
public:
    /** Set the unit workload stage of the component
     *
     * @param[in] stage Stage to assign
     *
     * @return Reference to *this, allowing fluent chaining
     */
    KernelProperties &stage(const UnitWorkloadStage &stage)
    {
        _stage = stage;
        return *this;
    }
    /** Get the unit workload stage of the component */
    UnitWorkloadStage stage() const
    {
        return _stage;
    }

private:
    UnitWorkloadStage _stage{}; /**< Stage the component runs in; default-constructed */
};

/** Two kernel properties compare equal if and only if their stages are equal */
inline bool operator==(const KernelProperties &config0, const KernelProperties &config1)
{
    return config0.stage() == config1.stage();
}

/** Forward declaration */
class IGpuTemplateComponentWriter;
class IGpuCkwComponentDriver;

/** An abstract interface of a component. It enables manipulation by the component graph for purposes like fusion
 */
class IGpuKernelComponent
{
public:
    using Properties = KernelProperties;

public:
    /** Constructor
     *
     * @param[in] id         Component id
     * @param[in] properties Kernel component properties
     * @param[in] tensors    Tensor arguments to the components
     */
    IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
        : _id{id}, _properties{properties}, _tensors{tensors}
    {
    }
    /** Destructor */
    virtual ~IGpuKernelComponent()
    {
    }
    /** Get component id */
    ComponentId id() const
    {
        return _id;
    }
    /** Get tensor arguments
     *
     * @note Returns the pack by value (a copy), not a reference
     */
    ArgumentPack<ITensorInfo> tensors() const
    {
        return _tensors;
    }
    /** Get properties (returned by value) */
    Properties properties() const
    {
        return _properties;
    }
    /** Get writer for the component
     *
     * @note Base implementation returns nullptr; concrete components override this
     *       to expose their CKW driver
     */
    virtual const IGpuCkwComponentDriver *ckw_component_driver() const
    {
        return nullptr;
    }
    /** Get component type */
    virtual GpuComponentType type() const = 0;

private:
    ComponentId               _id{-1}; /**< -1 marks an id that was never assigned */
    Properties                _properties{};
    ArgumentPack<ITensorInfo> _tensors{};
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES_H

#include <cstdint>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Uniquely identifies a kernel component within a workload
 */
using ComponentId = int32_t;

/** Component type in the context of fusion
 * Its main purpose is to inform the optimizer how to perform fusion.
 *
 * @note Enumerator order (and thus underlying values) must be preserved:
 *       Complex = 0, Simple = 1, Unfusable = 2, Output = 3.
 */
enum class GpuComponentType
{
    Complex,   /**< e.g. ClComponentDepthwiseConv2d reports this type */
    Simple,    /**< e.g. ClComponentActivation and ClComponentCast report this type */
    Unfusable, /**< Component that cannot take part in fusion */
    Output     /**< Component producing the workload output */
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES_H
#include "ClComponentActivation.h"

#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
Status ClComponentActivation::validate(const Properties                &properties,
                                       const ArgumentPack<ITensorInfo> &tensors,
                                       const Attributes                &attributes)
{
    // Neither properties nor attributes affect the validity of the tensor arguments
    ARM_COMPUTE_UNUSED(properties, attributes);

    const ITensorInfo *const src = tensors.get_const_tensor(TensorType::ACL_SRC);
    const ITensorInfo *const dst = tensors.get_const_tensor(TensorType::ACL_DST);

    // Pointers must be valid before any of the dereferencing checks below run
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);

    // Only floating-point data types are supported by this component
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);

    // Activation is elementwise: src and dst must agree in shape, type and layout
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);

    // All tensor infos are initialized
    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);

    // Device requirements are met (F16 only if the device supports it)
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);

    return Status{};
}

ClComponentActivation::ClComponentActivation(ComponentId                            id,
                                             const IGpuKernelComponent::Properties &properties,
                                             const ArgumentPack<ITensorInfo>       &tensors,
                                             const Attributes                      &attributes)
    // Base class stores id/properties/tensors; this component owns its CKW writer
    : IGpuKernelComponent{id, properties, tensors},
      _component_writer{std::make_unique<GpuCkwActivation>(id, tensors, attributes)}
{
}

// Out-of-line destructor: GpuCkwActivation is only forward-declared in the header,
// so unique_ptr's deleter must be instantiated here where the type is complete
ClComponentActivation::~ClComponentActivation()
{
}

const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
{
    return _component_writer.get();
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; + +/** Forward declaration */ +class GpuCkwActivation; + +class ClComponentActivation final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = ActivationLayerInfo; + + /** Validate the component + * + * @param[in] properties Component properties @ref Properties + * @param[in, out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes @ref Attributes + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC: Input + * - ACL_DST: Output + * + * Tensor argument constness: + * - ACL_SRC: Const + * - ACL_DST: Const + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |ACL_SRC |ACL_DST | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | + */ + static Status + validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + + /** Constructor + * + * Similar to @ref ClComponentActivation::validate() + */ + ClComponentActivation(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes); + + /** Destructor */ + ~ClComponentActivation() override; + + /** Prevent instances of this class from being copy constructed */ + ClComponentActivation(const ClComponentActivation &component) = delete; + + /** Prevent instances of this class from being copied */ 
+ ClComponentActivation &operator=(const ClComponentActivation &component) = delete; + + /** Allow instances of this class to be move constructed */ + ClComponentActivation(ClComponentActivation &&component) = default; + + /** Allow instances of this class to be moved */ + ClComponentActivation &operator=(ClComponentActivation &&component) = default; + + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Simple; + } + +private: + std::unique_ptr<GpuCkwActivation> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp new file mode 100644 index 0000000000..e1850d78c4 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClComponentCast.h" + +#include "arm_compute/core/Error.h" + +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status ClComponentCast::validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties, attributes, settings); + + const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src == dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), + "input and target data types should be different"); + + // Validate in case of configured dst + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), + "dst and target data types should be same"); + } + + return Status{}; +} +ClComponentCast::ClComponentCast(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings 
&settings) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique<GpuCkwCast>(id, tensors, attributes)} +{ + ARM_COMPUTE_UNUSED(attributes, settings); +} + +ClComponentCast::~ClComponentCast() +{ +} + +const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const +{ + return _component_writer.get(); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h new file mode 100644 index 0000000000..201dacc288 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; + +/** Component specific settings + */ +class ClComponentCastSettings +{ +public: +private: +}; + +/** Forward declaration */ +class GpuCkwCast; + +class ClComponentCast final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = CastAttributes; + /** Settings are a set of backend-specific parameters that influence the implementation of a component */ + using Settings = ClComponentCastSettings; + + /** Validate the component + * + * @param[in] properties Component properties @ref Properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes @ref Attributes + * @param[in] settings Component settings @ref Settings + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: Input + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - All + * + ** Valid data type configurations: + * |ACL_SRC_0 |ACL_DST_0 | + * |:--------------|:--------------------------------------| + * |U8 | S8, U16, S16, U32, S32, F16, F32 | + * |U16 | U8, S8, S16, U32, S32, F16, F32 | + * |S16 | U8, S8, U16, U32, S32, F16, F32 | + * |U32 | U8, S8, U16, S16, S32, F16, F32 | + * |S32 | U8, S8, U16, S16, U32, F16, F32 | + * |F16 | U8, S8, U16, S16, U32, S32, F32 | + * |F32 | U8, S8, U16, S16, U32, S32, F16 | + */ + static Status 
validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Constructor + * + * Similar to @ref ClComponentCast::validate() + */ + ClComponentCast(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Destructor */ + ~ClComponentCast() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentCast(const ClComponentCast &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentCast &operator=(const ClComponentCast &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentCast(ClComponentCast &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentCast &operator=(ClComponentCast &&component) = default; + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Simple; + } + +private: + std::unique_ptr<GpuCkwCast> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp new file mode 100644 index 0000000000..7cd23d6115 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentDepthwiseConv2d.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +using Settings = ClComponentDepthwiseConv2dSettings; + +Settings &Settings::export_input_to_cl_image(bool cl_image) +{ + _export_input_to_cl_image = cl_image; + return *this; +} + +bool Settings::export_input_to_cl_image() const +{ + return _export_input_to_cl_image; +} + +Settings &Settings::export_weights_to_cl_image(bool cl_image) +{ + _export_weights_to_cl_image = cl_image; + return *this; +} + +bool Settings::export_weights_to_cl_image() const +{ + return _export_weights_to_cl_image; +} + +Settings &Settings::fast_relaxed_math(bool fast_relaxed_math) +{ + _fast_relaxed_math = fast_relaxed_math; + return *this; +} + +bool Settings::fast_relaxed_math() const +{ + return _fast_relaxed_math; +} + +Settings &Settings::is_fma_available(bool is_fma_available) +{ + _is_fma_available = is_fma_available; + return *this; +} + +bool Settings::is_fma_available() const +{ + return _is_fma_available; +} + +Settings &Settings::n0(unsigned int n0) +{ + _n0 = n0; + return *this; +} + +unsigned int Settings::n0() const +{ + return _n0; +} + +Settings &Settings::m0(unsigned int m0) +{ + _m0 = m0; + return *this; +} + +unsigned int Settings::m0() const +{ + return _m0; +} + +Status ClComponentDepthwiseConv2d::validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties, settings); + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto 
bia = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst); + + // 1. Check validity + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); + } + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); + } + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); + } + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + // wei shape is correct + const DataLayout data_layout = src->data_layout(); + const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * attributes.depth_multiplier())); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional"); + + // dst shape is correct + const PadStrideInfo pad_stride_info = + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type()); + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape output_shape = 
misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + + // Check strides and dilation + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1); + ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1); + + if (conv_info.depth_multiplier > 1 && settings.n0() > 1) + { + ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0); + } + + // Check export weights to cl image + ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && + (export_to_cl_image(wei) == false), + "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0)); + + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * conv_info.depth_multiplier)); + + // bia shape is correct + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx], + "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // 2. 
Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + // Texture in the input tensor + ARM_COMPUTE_RETURN_ERROR_ON((settings.export_input_to_cl_image() == true)); + + return Status{}; +} + +ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique<GpuCkwDepthwiseConv2d>(id, tensors, attributes, settings)} +{ + ARM_COMPUTE_UNUSED(attributes, settings); +} +ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d() +{ +} +const IGpuCkwComponentDriver *ClComponentDepthwiseConv2d::ckw_component_driver() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h new file mode 100644 index 0000000000..7526361f1c --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H

#include "arm_compute/core/Error.h"

#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"

#include <memory>

namespace arm_compute
{
/** Forward declaration */
class ITensorInfo;
namespace experimental
{
namespace dynamic_fusion
{
/** Forward declaration */
template <typename T>
class ArgumentPack;
class DepthwiseConv2dAttributes;

/** Forward declaration */
class GpuCkwDepthwiseConv2d;

/** Component specific settings
 *
 * All setters return *this to allow fluent chaining.
 */
class ClComponentDepthwiseConv2dSettings
{
public:
    /** Set export_input_to_cl_image flag */
    ClComponentDepthwiseConv2dSettings &export_input_to_cl_image(bool cl_image);
    /** Get export_input_to_cl_image flag */
    bool export_input_to_cl_image() const;

    /** Set export_weights_to_cl_image flag */
    ClComponentDepthwiseConv2dSettings &export_weights_to_cl_image(bool cl_image);
    /** Get export_weights_to_cl_image flag */
    bool export_weights_to_cl_image() const;

    /** Set fast_relaxed_math flag */
    ClComponentDepthwiseConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
    /** Get fast_relaxed_math flag */
    bool fast_relaxed_math() const;

    /** Set is_fma_available flag */
    ClComponentDepthwiseConv2dSettings &is_fma_available(bool is_fma_available);
    /** Get is_fma_available flag */
    bool is_fma_available() const;

    /** Set N0: number of columns processed by each thread */
    ClComponentDepthwiseConv2dSettings &n0(unsigned int n0);
    /** Get N0: number of columns processed by each thread */
    unsigned int n0() const;

    /** Set M0: number of rows processed by each thread */
    ClComponentDepthwiseConv2dSettings &m0(unsigned int m0);
    /** Get M0: number of rows processed by each thread */
    unsigned int m0() const;

private:
    bool         _export_input_to_cl_image{false};   /**< Export input to cl_image */
    bool         _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
    bool         _fast_relaxed_math{true};           /**< Enable/disable -cl-fast-relaxed-math flag */
    bool         _is_fma_available{false};           /**< Is fma instruction available */
    unsigned int _n0{0};                             /**< Number of columns processed by each thread */
    unsigned int _m0{0};                             /**< Number of rows processed by each thread */
};

/** Forward declaration */
class ClTemplateDepthwiseConv2d;

/** Depthwise 2D convolution kernel component for the dynamic fusion CL backend */
class ClComponentDepthwiseConv2d final : public IGpuKernelComponent
{
public:
    /** Attributes are a set of backend-agnostic parameters that define what a component does */
    using Attributes = DepthwiseConv2dAttributes;
    /** Settings are a set of backend-specific parameters that influence the implementation of a component */
    using Settings = ClComponentDepthwiseConv2dSettings;

public:
    /** Validate the component
     *
     * @param[in]     properties Component properties @ref Properties
     * @param[in,out] tensors    Tensor arguments to the component
     * @param[in]     attributes Component attributes @ref Attributes
     * @param[in]     settings   Component settings @ref Settings
     *
     * @return Status        Validation results
     *
     * Tensor argument names:
     * - ACL_SRC_0: Input
     * - ACL_SRC_1: Weight
     * - ACL_SRC_2: Bias (Optional)
     * - ACL_DST_0: Output
     *
     * Tensor argument constness:
     * - ACL_SRC_0: Const
     * - ACL_SRC_1: Const
     * - ACL_SRC_2: Const
     * - ACL_DST_0: Const
     *
     * Valid data layouts:
     * - NHWC
     *
     * Valid data type configurations:
     * |ACL_SRC_0      |ACL_SRC_1      |ACL_SRC_2      |ACL_DST_0      |
     * |:--------------|:--------------|:--------------|:--------------|
     * |F16            |F16            |F16            |F16            |
     * |F32            |F32            |F32            |F32            |
     */
    static Status validate(const Properties                &properties,
                           const ArgumentPack<ITensorInfo> &tensors,
                           const Attributes                &attributes,
                           const Settings                  &settings);

    /** Constructor
     *
     * Similar to @ref ClComponentDepthwiseConv2d::validate()
     */
    ClComponentDepthwiseConv2d(ComponentId                      id,
                               const Properties                &properties,
                               const ArgumentPack<ITensorInfo> &tensors,
                               const Attributes                &attributes,
                               const Settings                  &settings);

    /** Destructor (out-of-line so the forward-declared writer type is complete at deletion) */
    ~ClComponentDepthwiseConv2d() override;
    /** Prevent instances of this class from being copy constructed */
    ClComponentDepthwiseConv2d(const ClComponentDepthwiseConv2d &component) = delete;
    /** Prevent instances of this class from being copied */
    ClComponentDepthwiseConv2d &operator=(const ClComponentDepthwiseConv2d &component) = delete;
    /** Allow instances of this class to be move constructed */
    ClComponentDepthwiseConv2d(ClComponentDepthwiseConv2d &&component) = default;
    /** Allow instances of this class to be moved */
    ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default;
    /** Get writer for the component */
    const IGpuCkwComponentDriver *ckw_component_driver() const override;
    /** Get component type */
    GpuComponentType type() const override
    {
        return GpuComponentType::Complex;
    }

private:
    std::unique_ptr<GpuCkwDepthwiseConv2d> _component_writer; /**< CKW code writer owned by this component */
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentDirectConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +bool ClComponentDirectConv2dSettings::export_to_cl_image() const +{ + return _desc.export_weights_to_cl_image; +} + +ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::fast_relaxed_math(bool fast_relaxed_math) +{ + _fast_relaxed_math = fast_relaxed_math; + return *this; +} + +bool ClComponentDirectConv2dSettings::fast_relaxed_math() const +{ + return _fast_relaxed_math; +} + +ClComponentDirectConv2dSettings & +ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) +{ + _desc = desc; + return *this; +} + +DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descriptor() const +{ + return _desc; +} + +Status ClComponentDirectConv2d::validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties); + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst); + + // 1. 
Check validity + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); + } + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); + } + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); + } + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + // wei shape is correct + const DataLayout data_layout = src->data_layout(); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + + // dst shape is correct + PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType{}); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); + + // bia shape is correct + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3), + "Biases size and number of dst feature maps should match"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // 2. Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + const auto desc = settings.direct_conv_descriptor(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, + "N0 can only be: 1, 2, 3, 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, + "K0 can only be: 1, 2, 3, 4, 8, and 16"); + return Status{}; +} + +ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings)} +{ +} + +ClComponentDirectConv2d::~ClComponentDirectConv2d() +{ +} + +const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const +{ + return _component_writer.get(); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h new file mode 100644 index 0000000000..c50b0fa0ce --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include <memory> + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; +class Conv2dAttributes; + +/** Component specific settings + */ +class ClComponentDirectConv2dSettings +{ +public: + /** Get export_to_cl_image flag */ + bool export_to_cl_image() const; + + /** Set fast_relaxed_math flag */ + ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math); + /** Get fast_relaxed_math flag */ + bool fast_relaxed_math() const; + + /** Set direct convolution descriptor */ + ClComponentDirectConv2dSettings &direct_conv_descriptor(const DirectConvComputeKernelInfo &desc); + /** Get direct convolution descriptor */ + DirectConvComputeKernelInfo direct_conv_descriptor() const; + +private: + bool _fast_relaxed_math{true}; + DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor +}; + +/** Forward declaration */ +class GpuCkwDirectConv2d; + +class ClComponentDirectConv2d final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = Conv2dAttributes; + /** Settings are a set of backend-specific parameters that influence the implementation of a component */ + using Settings = ClComponentDirectConv2dSettings; + +public: + /** Validate the component + * + * @param[in] properties Component properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + * + * @return 
Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: Input + * - ACL_SRC_1: Weight + * - ACL_SRC_2: Bias (Optional) + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_SRC_1: Const + * - ACL_SRC_2: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 | + * |:--------------|:--------------|:--------------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + */ + static Status validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Constructor + * + * Similar to @ref ClComponentDirectConv2d::validate() + */ + ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Destructor */ + ~ClComponentDirectConv2d() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default; + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr<GpuCkwDirectConv2d> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // 
ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp new file mode 100644 index 0000000000..209c73dbee --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentElementwiseBinary.h" + +#include "arm_compute/core/Validate.h" + +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops{ + ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub, + ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul}; +} + +Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &tensors, + const ElementwiseBinaryCommonAttributes &attributes) +{ + const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + // Check operator type + ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), + "Provided Elementwise operation not supported."); + + // Check validity + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + + //Check data type for different elementwise operators + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, + DataType::S16, DataType::U8); + + // dst shape is correct + const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst."); + + const auto &lhs_shape = lhs->tensor_shape(); + const auto &rhs_shape = rhs->tensor_shape(); + const auto &dst_shape = dst->tensor_shape(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) && + 
detail::have_different_dimensions(rhs_shape, dst_shape, 0), + "Only LHS or RHS can be broadcasting, not both."); + + // Dimension Y and Z are collapsed together in the current kernel implementation, + // hence they cannot be independently broadcast or non-broadcast. + // See: ClTemplateElementwiseBinary::get_window + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != + (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), + "Dimension Y and Z must both be either broadcast or non-broadcast."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3), + "LHS broadcast in dimension 3 or higher is not supported."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3), + "RHS broadcast in dimension 3 or higher is not supported."); + + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); + + return Status{}; +} + +ClComponentElementwiseBinary::~ClComponentElementwiseBinary() +{ +} +ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)} +{ +} + +const IGpuCkwComponentDriver 
*ClComponentElementwiseBinary::ckw_component_driver() const +{ + return _component_writer.get(); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h new file mode 100644 index 0000000000..a4395a6219 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; + +/** Forward declaration */ +class GpuCkwElementwiseBinary; + +class ClComponentElementwiseBinary final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = ElementwiseBinaryCommonAttributes; + +public: + /** Validate the component + * + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: lhs + * - ACL_SRC_1: rhs + * - ACL_DST_0: dst + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_SRC_1: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - All + * + * Valid data type configurations (for DIV FP32/FP16/S32 supported, for POWER only FP32/FP16 supported): + * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |U8 |U8 |U8 | + */ + static Status validate(const ArgumentPack<ITensorInfo> &tensors, + const ElementwiseBinaryCommonAttributes &attributes); + + /** Constructor + * + * Similar to @ref ClComponentElementwiseBinary::validate() + */ + ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes); + + /** Destructor */ + 
~ClComponentElementwiseBinary() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentElementwiseBinary(const ClComponentElementwiseBinary &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentElementwiseBinary &operator=(const ClComponentElementwiseBinary &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentElementwiseBinary(ClComponentElementwiseBinary &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default; + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Simple; + } + +private: + std::unique_ptr<GpuCkwElementwiseBinary> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp new file mode 100644 index 0000000000..53ac8da41f --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h" + +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h" +#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +using Attributes = MatMulAttributes; +using Settings = GpuMatMulSettings; + +Status validate_matmul_kernel_info(Attributes attributes, Settings settings) +{ + const bool adj_lhs = attributes.adj_lhs(); + const bool adj_rhs = attributes.adj_rhs(); + const int m0 = settings.m0(); + const int n0 = settings.n0(); + const int k0 = settings.k0(); + + // Validate M0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); + + if (adj_lhs) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + } + + // Validate N0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); + + // Validate K0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); + if (!adj_lhs || adj_rhs) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); + } + + return Status{}; +} + +} // namespace + +Status ClComponentMatMul::validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties); + ARM_COMPUTE_UNUSED(attributes); + + const auto lhs = 
tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + // Currently, the only supported case is when adj_lhs = false and adj_rhs = true + ARM_COMPUTE_RETURN_ERROR_ON((attributes.adj_lhs() != false) && (attributes.adj_rhs() != true)); + + // Check if Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); + + // Check if block sizes are supported + MatMulKernelInfo matmul_kernel_info = + MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0()); + ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(attributes, settings)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::kernels::validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + + // Check if dst shape is correct + const auto expected_dst_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape); + + return Status{}; +} + +ClComponentMatMul::ClComponentMatMul(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, 
tensors}, + _component_writer{std::make_unique<GpuCkwMatMul>(id, tensors, attributes, settings)} +{ +} + +ClComponentMatMul::~ClComponentMatMul() +{ +} + +const IGpuCkwComponentDriver *ClComponentMatMul::ckw_component_driver() const +{ + return _component_writer.get(); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h new file mode 100644 index 0000000000..41833e4adb --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include <memory> + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; +class MatMulAttributes; +class GpuCkwMatMul; + +class ClComponentMatMul final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = MatMulAttributes; + /** Settings are a set of backend-specific parameters that influence the implementation of a component */ + using Settings = GpuMatMulSettings; + + /** Validate the component + * + * @param[in] properties Component properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: LHS + * - ACL_SRC_1: RHS + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_SRC_1: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + */ + static Status validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Constructor + * + * Similar to @ref ClComponentMatMul::validate() + */ + ClComponentMatMul(ComponentId id, + const 
Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + /** Destructor */ + ~ClComponentMatMul() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentMatMul(const ClComponentMatMul &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentMatMul &operator=(const ClComponentMatMul &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentMatMul(ClComponentMatMul &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentMatMul &operator=(ClComponentMatMul &&component) = default; + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr<GpuCkwMatMul> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp new file mode 100644 index 0000000000..6e7243dc04 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentPool2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" +#include "src/dynamic_fusion/utils/Utils.h" + +#include <memory> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status ClComponentPool2d::validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties, settings); + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), + "Unsupported Pooling type"); + + // 1. Check validity + // Check if pooling is valid + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, true)), + "Pooling region that is entirely outside input tensor is unsupported"); + + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, true))); + + // 2. 
Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + return Status{}; +} + +ClComponentPool2d::ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)} +{ +} +ClComponentPool2d::~ClComponentPool2d() +{ +} +const IGpuCkwComponentDriver *ClComponentPool2d::ckw_component_driver() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h new file mode 100644 index 0000000000..d33e601f18 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; +class Pool2dAttributes; + +/** Forward declaration */ +class GpuCkwPool2d; + +class ClComponentPool2d final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = Pool2dAttributes; + /** Settings are a set of backend-specific parameters that influence the implementation of a component */ + using Settings = GpuPool2dSettings; + +public: + /** Validate the component + * + * @param[in] properties Component properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: Input + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |ACL_SRC_0 |ACL_DST_0 | + * 
|:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | + */ + static Status validate(const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Constructor + * + * @param[in] id Unique Component Identifier within a workload + * @param[in] properties Component properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Destructor */ + ~ClComponentPool2d() override; + + /** Prevent instances of this class from being copy constructed */ + ClComponentPool2d(const ClComponentPool2d &component) = delete; + + /** Prevent instances of this class from being copied */ + ClComponentPool2d &operator=(const ClComponentPool2d &component) = delete; + + /** Allow instances of this class to be move constructed */ + ClComponentPool2d(ClComponentPool2d &&component) = default; + + /** Allow instances of this class to be moved */ + ClComponentPool2d &operator=(ClComponentPool2d &&component) = default; + + /** Get GPU kernel writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr<GpuCkwPool2d> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp new file mode 100644 index 0000000000..dce85c424e --- /dev/null +++ 
b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentReshape.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CL/CLValidate.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status ClComponentReshape::validate(const ArgumentPack<ITensorInfo> &tensors) +{ + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); + + return Status{}; +} + +ClComponentReshape::ClComponentReshape(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors) + : IGpuKernelComponent{id, properties, tensors} +{ +} +ClComponentReshape::~ClComponentReshape() +{ +} +const IGpuCkwComponentDriver *ClComponentReshape::ckw_component_driver() const +{ + /* NOT IMPLEMENTED */ + return nullptr; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h new file mode 100644 index 0000000000..fd0f966da1 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; + +/** Forward declaration */ +class ClTemplateReshape; + +class ClComponentReshape final : public IGpuKernelComponent +{ +public: + + /** Validate the component + * + * @param[in,out] tensors Tensor arguments to the component + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: src + * - ACL_DST_0: dst + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * - All + */ + static Status validate(const ArgumentPack<ITensorInfo> &tensors); + + /** Constructor + * + * @param[in] id Component id + * @param[in] properties Component properties @ref Properties + * @param[in] tensors Tensor arguments to the component + */ + ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors); + + /** Destructor */ + ~ClComponentReshape() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentReshape(const ClComponentReshape &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentReshape &operator=(const ClComponentReshape &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentReshape(ClComponentReshape &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentReshape &operator=(ClComponentReshape &&component) = default; + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() 
const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp new file mode 100644 index 0000000000..411eeca802 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ClComponentResize.h" + +#include "arm_compute/core/Error.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +class GpuCkwResize; + +Status ClComponentResize::validate(const IGpuKernelComponent::Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const ClComponentResize::Attributes &attributes) +{ + ARM_COMPUTE_UNUSED(properties); + + const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + // Mismatching data types and quantization info + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + + // Device requirements met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + + // Align corners and sampling policy conformance + ARM_COMPUTE_RETURN_ERROR_ON( + attributes.align_corners() && + !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy())); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + return Status(); +} + +ClComponentResize::ClComponentResize(ComponentId id, + const IGpuKernelComponent::Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const ClComponentResize::Attributes &attributes) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique<GpuCkwResize>(id, tensors, attributes)} +{ +} + +ClComponentResize::~ClComponentResize() +{ +} + +const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const +{ + return _component_writer.get(); +} + 
+} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h new file mode 100644 index 0000000000..9a1169c45f --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H + +#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h" + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; + +/** Forward declaration */ +class GpuCkwResize; + +class ClComponentResize final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = ResizeAttributes; + + /** Validate the component + * + * @param[in] properties Component properties @ref Properties + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes @ref Attributes + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: Input + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + ** Valid data type configurations: + * |ACL_SRC_0 |ACL_DST_0 | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * |U8 |U8 | + * |S16 |S16 | + */ + static Status + validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes); + + /** Constructor + * + * Similar to @ref ClComponentResize::validate() + */ + ClComponentResize(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes); + + /** Destructor */ + ~ClComponentResize() override; + + /** Prevent instances of this class from being copy constructed */ + ClComponentResize(const 
ClComponentResize &component) = delete; + + /** Prevent instances of this class from being copied */ + ClComponentResize &operator=(const ClComponentResize &component) = delete; + + /** Allow instances of this class to be move constructed */ + ClComponentResize(ClComponentResize &&component) = default; + + /** Allow instances of this class to be moved */ + ClComponentResize &operator=(ClComponentResize &&component) = default; + + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() const override; + + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr<GpuCkwResize> _component_writer; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp new file mode 100644 index 0000000000..3db6c5cd2d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClComponentStore.h" + +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h" + +#include <memory> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status ClComponentStore::validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors) +{ + ARM_COMPUTE_UNUSED(properties, tensors); + return Status{}; +} +ClComponentStore::ClComponentStore(ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors) + : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<GpuCkwStore>(id, tensors)} +{ +} +ClComponentStore::~ClComponentStore() +{ +} +const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h new file mode 100644 index 0000000000..2c1dd0f6fc --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H + +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include <memory> + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; +class GpuCkwStore; + +class ClComponentStore final : public IGpuKernelComponent +{ +public: + /** Validate the component + * + * @param[in] properties Component properties + * @param[in] tensors Tensor arguments to the components + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: Input + * - ACL_DST_0: Output + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |ACL_SRC_0 |ACL_DST_0 | + * |:--------------|:--------------| + * |All |All | + */ + static Status validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors); + /** Constructor + * + * Similar to @ref ClComponentStore::validate() + */ + ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors); + /** Destructor */ + ~ClComponentStore() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentStore(const ClComponentStore &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentStore &operator=(const ClComponentStore &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentStore(ClComponentStore &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentStore &operator=(ClComponentStore &&component) = default; + /** Get writer for the component */ + const IGpuCkwComponentDriver *ckw_component_driver() 
const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Output; + } + +private: + std::unique_ptr<GpuCkwStore> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h new file mode 100644 index 0000000000..4c3e84e59d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY + +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" + +#include <ostream> +#include <sstream> +#include <string> + +namespace arm_compute +{ +/** Type printers for all types related to the component @ref ClComponentElementwiseBinary + */ + +using namespace experimental::dynamic_fusion; + +/** Formatted output of the pute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type. + * + * @param[out] os Output stream. + * @param[in] op arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type to output. + * + * @return Modified output stream. + */ +inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op) +{ + const std::map<ClComponentElementwiseBinary::Attributes::ElementwiseOp, std::string> op_name = { + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}}; + os << op_name.at(op); + return os; +} +/** Formatted output of the arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type. 
+ * + * @param[in] op arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type to output. + * + * @return Formatted string. + */ +inline std::string to_string(const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op) +{ + std::stringstream str; + str << op; + return str.str(); +} +} // namespace arm_compute +#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY */ diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp new file mode 100644 index 0000000000..201c9f243c --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" + +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); + + // Set the elementwise operation to Add then call the elementwise common validate_op + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add); + return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); +} + +Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); + + // Set the elementwise operation to Add then call the elementwise common is_supported_op + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add); + return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); +} + +ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) +{ + // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op() + // Set the elementwise 
operation to Add then call the elementwise common create_op + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add); + return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp new file mode 100644 index 0000000000..d25a2a3153 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *dst, + const CastAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON(src == dst); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + auto_init_if_empty(dst_info_to_validate, src->clone()->set_data_type(attributes.data_type())); + + // Check support level + // Data Type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::F16, DataType::F32); + + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + // Validate Cast Component + { + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentCast::validate(properties, arguments, attributes, settings)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{}; +} +constexpr GpuOperatorType 
operator_type = GpuOperatorType::Simple; +} // namespace + +Status +GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes) +{ + return is_supported_op_helper(context, src, nullptr, attributes); +} + +Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + auto_init_if_empty(dst_info_to_validate, src->clone()->set_data_type(attributes.data_type())); + + // Perform fusion test + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. 
This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); +} + +ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, attributes); + ARM_COMPUTE_ERROR_THROW_ON(GpuCast::validate_op(sketch, src, attributes)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + // Auto initialize dst tensor info if empty + auto_init_if_empty(*dst, src->clone()->set_data_type(attributes.data_type())); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + const auto *sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); + + // Add Depthwise Conv2d Component + { + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentCast>(properties, arguments, attributes, settings); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, dst); + + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + 
sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp new file mode 100644 index 0000000000..4d6e7f81bb --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h" + +#include "arm_compute/core/experimental/Types.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *dst, + const ClampAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), + "Maximum clamp value cannot be lower than minimum value"); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + auto_init_if_empty(dst_info_to_validate, *src->clone()); + + // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + // Validate Activation Component + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC, src); + arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr); + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); 
+ } + return Status{}; +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; +} // namespace + +Status +GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes) +{ + return is_supported_op_helper(context, src, nullptr, attributes); +} + +Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + // Check if tensors have valid id, i.e. they are created from a sketch + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + auto_init_if_empty(dst_info_to_validate, *src->clone()); + + // Perform fusion test to check if the operator meets fusion constraints + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC, src); + tensors.add_const_tensor(ACL_DST, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. 
This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); +} + +ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, attributes); + ARM_COMPUTE_ERROR_THROW_ON(GpuClamp::validate_op(sketch, src, attributes)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + // Auto initialize dst tensor + auto_init_if_empty(*dst, *src->clone()); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + + // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; + + const auto *const sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + // Add Activation Component + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC, src); + arguments.add_const_tensor(ACL_DST, dst); + comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info); + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC, src); + tensors.add_const_tensor(ACL_DST, dst); + + const auto op = 
sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp new file mode 100644 index 0000000000..aaeec543f8 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" + +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +{ + // Get GPU target + GPUTarget gpu_target = CLScheduler::get().target(); + + std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t = + arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); + + return t->configure(src, weights, conv_info); +} + +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const Conv2dAttributes &attributes) +{ + if (dst->total_size() == 0U) + { + const auto shape = misc::shape_calculator::compute_deep_convolution_shape( + src->tensor_shape(), src->data_layout(), wei->tensor_shape(), + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); + } +} + +/* A helper method to reduce the duplication in 
dst tensor initialization +* when calling validate() +*/ +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const ITensorInfo *dst, + const Conv2dAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes); + + // Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + // Check components + const auto gpu_target = context.gpu_target(); + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + // Validate Direct Conv2d Component + { + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDirectConv2d::Settings(); + + settings.fast_relaxed_math( + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst_info_to_validate_ptr->data_type() == DataType::F32 || + dst_info_to_validate_ptr->data_type() == DataType::F16)); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_SRC_1, wei); + arguments.add_const_tensor(ACL_SRC_2, bia); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + return Status{}; +} + +constexpr 
GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +Status GpuConv2d::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const Conv2dAttributes &attributes) +{ + return is_supported_op_helper(context, src, wei, bia, nullptr, attributes); +} + +Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const Conv2dAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported"); + + // Check if tensors have valid id. I.e. they are created from a sketch + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); + } + + // This tensor info will have invalid id but because all the existing tensors in the + // sketch have valid ids and the DependencyGraph implementation has no notion of validness + // regarding tensor ids, it'll be just another tensor id and will validate + // Additionally, a new dst id is added every time in create_op, thus there's no need to validate it + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes); + + // Perform fusion test + // Check if operator meets fusion constraints + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. 
This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes); +} + +ITensorInfo *GpuConv2d::create_op( + GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes) +{ + ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes); + PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR); + // Initialize the direct convolution descriptor + const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, attributes)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst); + + // Auto initialize dst tensor + calculate_and_init_dst_if_empty(dst, src, wei, attributes); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + + const auto sketch_ctx = sketch.implementation().context(); + + const auto gpu_target = sketch_ctx->gpu_target(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + ARM_COMPUTE_UNUSED(cl_compile_ctx); + + // Add Direct Conv2d Component + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + auto settings = ClComponentDirectConv2d::Settings(); + + settings.fast_relaxed_math( + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); + + 
settings.direct_conv_descriptor(desc); + + if (settings.export_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); + } + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_SRC_1, wei); + arguments.add_const_tensor(ACL_SRC_2, bia); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp new file mode 100644 index 0000000000..e2b673bd43 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const DepthwiseConv2dAttributes &attributes) +{ + if (dst->total_size() == 0U) + { + const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + attributes.dimension_rounding_type()); + + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); + } +} + +/* A helper method to reduce the duplication in dst tensor initialization +* when calling validate() +*/ +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const ITensorInfo *dst, + const DepthwiseConv2dAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + 
dst_info_to_validate_ptr = dst;
+    }
+
+    calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+    // Check support level
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+    const GpuTarget gpu_target = context.gpu_target();
+
+    if (context.gpu_language() == GpuLanguage::OpenCL)
+    {
+        const CLCompileContext *cl_compile_ctx = context.cl_compile_context();
+        ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+        // Validate Depthwise Conv2d Component
+        {
+            const auto properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+            auto settings = ClComponentDepthwiseConv2d::Settings();
+
+            const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+                                                 attributes.pad().left, attributes.pad().right, attributes.pad().top,
+                                                 attributes.pad().bottom, DimensionRoundingType::FLOOR);
+
+            // Get the depthwise convolution compute parameters
+            auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+            const DWCComputeKernelInfo dwc_info =
+                t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+
+            settings.fast_relaxed_math(
+                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+                (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+                 dst_info_to_validate_ptr->data_type() == DataType::F16));
+
+            settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) // FMA unavailable only on Midgard; must match GpuDepthwiseConv2d::create_op()
+                .m0(dwc_info.m0)
+                .n0(dwc_info.n0)
+                .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+                .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+
+            ArgumentPack<ITensorInfo> arguments;
+            arguments.add_const_tensor(ACL_SRC_0, src);
+            arguments.add_const_tensor(ACL_SRC_1, wei);
+            arguments.add_const_tensor(ACL_SRC_2, 
bia); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + ARM_COMPUTE_RETURN_ON_ERROR( + ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{}; +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const DepthwiseConv2dAttributes &attributes) +{ + return is_supported_op_helper(context, src, wei, bia, nullptr, attributes); +} + +Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const DepthwiseConv2dAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported"); + + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); + + if (bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); + } + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes); + + // Perform fusion test + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. 
This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes); +} + +ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + ITensorInfo *wei, + ITensorInfo *bia, + const DepthwiseConv2dAttributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei); + ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes); + ARM_COMPUTE_ERROR_THROW_ON(GpuDepthwiseConv2d::validate_op(sketch, src, wei, bia, attributes)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + calculate_and_init_dst_if_empty(dst, src, wei, attributes); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + const auto *sketch_ctx = sketch.implementation().context(); + const GpuTarget gpu_target = sketch_ctx->gpu_target(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); + + // Add Depthwise Conv2d Component + { + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDepthwiseConv2d::Settings(); + + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType::FLOOR); + + // Get the depthwise convolution compute parameters + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = + t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); + + settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) + .m0(dwc_info.m0) + .n0(dwc_info.n0) + 
.export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + + if (settings.export_input_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src); + } + + if (settings.export_weights_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); + } + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_SRC_1, wei); + arguments.add_const_tensor(ACL_SRC_2, bia); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentDepthwiseConv2d>(properties, arguments, attributes, settings); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, dst); + + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp new file mode 100644 index 0000000000..2997b28ec1 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h" + +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + ARM_COMPUTE_UNUSED(attributes); + + if (dst->total_size() == 0U) + { + const auto dst_shape = misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), + MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0())); + + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(dst_shape)); + } +} + +/* A helper method to reduce the duplication in dst tensor initialization +* when calling validate() +*/ +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings); + + // Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); 
+ + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + // Validate MatMul Component + { + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentMatMul::validate(properties, arguments, attributes, settings)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + return Status{}; +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +int GpuMatMulSettings::n0() const +{ + return _n0; +} + +GpuMatMulSettings &GpuMatMulSettings::n0(int n0) +{ + _n0 = n0; + return *this; +} + +int GpuMatMulSettings::m0() const +{ + return _m0; +} + +GpuMatMulSettings &GpuMatMulSettings::m0(int m0) +{ + _m0 = m0; + return *this; +} + +int GpuMatMulSettings::k0() const +{ + return _k0; +} + +GpuMatMulSettings &GpuMatMulSettings::k0(int k0) +{ + _k0 = k0; + return *this; +} + +Status GpuMatMul::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + return is_supported_op_helper(context, lhs, rhs, nullptr, attributes, settings); +} + +Status GpuMatMul::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulAttributes &attributes, + const GpuMatMulSettings &settings) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + + // Check if tensors have valid id. I.e. 
they are created from a sketch + ARM_COMPUTE_RETURN_ERROR_ON(!lhs->has_valid_id() || !rhs->has_valid_id()); + + // Refer to GpuMatmul::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings); + + // Perform fusion test + // Check if operator meets fusion constraints + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), lhs, rhs, &dst_info_to_validate, attributes, settings); +} + +ITensorInfo *GpuMatMul::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, attributes, settings); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuMatMul::validate_op(sketch, lhs, rhs, attributes, settings)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // Auto initialize dst tensor + calculate_and_init_dst_if_empty(dst, lhs, rhs, attributes, settings); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + const auto sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + 
ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentMatMul>(properties, arguments, attributes, settings); + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_const_tensor(ACL_DST_0, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp new file mode 100644 index 0000000000..b871171e8d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h" + +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); + + // Set the elementwise operation to Mul then call the elementwise common validate_op + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul); + return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); +} + +Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); + + // Set the elementwise operation to Mul then call the elementwise common is_supported_op + 
ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul); + return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); +} + +ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) +{ + // Set the elementwise operation to Mul then call the elementwise common create_op + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul); + return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp new file mode 100644 index 0000000000..f0d368d757 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/utils/Utils.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; +} // namespace + +Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + + // Initialize the destination tensor info. + TensorInfo dst_to_validate = *dst; + auto_init_if_empty(dst_to_validate, *src); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, &dst_to_validate); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, &dst_to_validate); + + ARM_COMPUTE_UNUSED(context); + return Status{}; +} + +Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + ARM_COMPUTE_RETURN_ERROR_ON(!is_alloc_tensor(dst)); + + // Initialize the destination tensor info. + TensorInfo dst_to_validate = *dst; + auto_init_if_empty(dst_to_validate, *src); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, &dst_to_validate); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, &dst_to_validate); + + // Perform fusion test. 
+ ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, &dst_to_validate); + + const auto group = sketch.implementation().operator_group(); + const auto op = group.new_operator(operator_type, tensors); + const auto success = group.try_add_operator(op, true); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload."); + + const auto status = is_supported_op(*sketch.gpu_context(), src, dst); + return status; +} + +void GpuOutput::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst)); + + // Auto initialize dst tensor info if empty + auto_init_if_empty(*dst, *src); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + const auto sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); + + // Add store component + { + IGpuKernelComponent::Properties properties; + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentStore>(properties, arguments); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, dst); + + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + 
sketch.implementation().operator_group().add_operator(op, true); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp new file mode 100644 index 0000000000..2d04f75610 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" +#include "src/dynamic_fusion/utils/Utils.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) +{ + ARM_COMPUTE_UNUSED(settings); + + if (dst->total_size() == 0U) + { + auto shape = misc::shape_calculator::compute_pool_shape( + *src, convert_pool_attr_to_pool_info(attributes, /* mixed_precision */ true)); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); + } +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +GpuPool2dSettings GpuPool2dSettings::use_inf_as_limit(bool use_inf_as_limit) +{ + _use_inf_as_limit = use_inf_as_limit; + return *this; +} + +bool GpuPool2dSettings::use_inf_as_limit() const +{ + return _use_inf_as_limit; +} + +Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + + // Auto initialize dst 
tensor info + TensorInfo dst_info_to_validate; + + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings); + + // Perform fusion test + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op(*sketch.gpu_context(), src, attributes, settings); +} + +Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + // Check exclude padding is not false + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), + "Exclude padding must be set to true in Attributes!"); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate; + + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings); + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + + // Validate Component + { + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + 
ARM_COMPUTE_RETURN_ON_ERROR(ClComponentPool2d::validate(properties, arguments, attributes, settings)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + return Status{}; +} + +ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) +{ + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings)); + ARM_COMPUTE_LOG_PARAMS(src, attributes, settings); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + // Auto initialize dst tensor + calculate_and_init_dst_if_empty(dst, src, attributes, settings); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + + const auto sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_UNUSED(cl_compile_ctx); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add Component + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentPool2d>(properties, arguments, attributes, settings); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_tensor(ACL_DST_0, dst); + + const auto op = 
sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp new file mode 100644 index 0000000000..0e1f16e8ff --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h" + +#include "arm_compute/core/Error.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *dst, + const ReshapeAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape())); + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + + // Validate GpuReshape Component + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentReshape::validate(arguments)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{ErrorCode::RUNTIME_ERROR, "GpuReshape is not Supported"}; +} + +GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +Status +GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) +{ + return is_supported_op_helper(context, src, nullptr, attributes); +} + +Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes) +{ + 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape())); + + // Perform fusion test + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); +} + +ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, attributes.shape()); + ARM_COMPUTE_ERROR_THROW_ON(GpuReshape::validate_op(sketch, src, attributes)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(attributes.shape())); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + const auto sketch_ctx = sketch.implementation().context(); + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_UNUSED(cl_compile_ctx); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add ElementwiseBinary Component + { + auto properties = IGpuKernelComponent::Properties(); + 
properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentReshape>(properties, arguments); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_tensor(ACL_DST_0, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp new file mode 100644 index 0000000000..8e794c88b2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes) +{ + if (dst->total_size() == 0U) + { + TensorShape out_shape = src->tensor_shape(); + + out_shape.set(1, attributes.output_width()); + out_shape.set(2, attributes.output_height()); + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); + } +} + +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *dst, + const ResizeAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes); + + // Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, 
DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + // Interpolation policy + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && + attributes.interpolation_policy() != InterpolationPolicy::BILINEAR, + "Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR"); + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + + // Validate Activation Component + { + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentResize::validate(properties, arguments, attributes)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{}; +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +Status +GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) +{ + return is_supported_op_helper(context, src, nullptr, attributes); +} + +Status +GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes); + + // Perform fusion test + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + 
tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); +} + +ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, attributes); + ARM_COMPUTE_ERROR_THROW_ON(GpuResize::validate_op(sketch, src, attributes)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + // Auto initialize dst tensor info if empty + calculate_and_init_dst_if_empty(dst, src, attributes); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + const auto *sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); + + // Add Resize Component + { + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentResize>(properties, arguments, attributes); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + 
tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, dst); + + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp new file mode 100644 index 0000000000..a2260c8c36 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + auto_init_if_empty(dst_info_to_validate, *src->clone()); + + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + // Validate Activation Component + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC, src); + arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr); + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + return Status{}; +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; +} // namespace + +Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) +{ + return is_supported_op_helper(context, src, 
nullptr); +} + +Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + // Check if tensors have valid id, i.e. they are created from a sketch + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + auto_init_if_empty(dst_info_to_validate, *src->clone()); + + // Perform fusion test to check if the operator meets fusion constraints + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC, src); + tensors.add_const_tensor(ACL_DST, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); +} + +ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src); + ARM_COMPUTE_ERROR_THROW_ON(GpuSigmoid::validate_op(sketch, src)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + // Auto initialize dst tensor + auto_init_if_empty(*dst, *src->clone()); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + const auto *const sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + // Add Activation Component + auto properties = IGpuKernelComponent::Properties(); + 
properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC, src); + arguments.add_const_tensor(ACL_DST, dst); + comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info); + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC, src); + tensors.add_const_tensor(ACL_DST, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp new file mode 100644 index 0000000000..d385752201 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h" + +#include "arm_compute/core/Error.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +GpuOperatorType operator_type = GpuOperatorType::Unfusable; +} // namespace + +Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *dst, + const Attributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + if (dst != nullptr) + { + dst_info_to_validate = *dst; + } + else + { + auto_init_if_empty(dst_info_to_validate, *src->clone()); + } + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + TensorShape logits_sum_shape = src->tensor_shape(); + TensorInfo logits(src->clone()->set_tensor_shape(logits_sum_shape)); + + // The sum tensor dim0 only need one element + 
logits_sum_shape.set(0, 1); + TensorInfo sum(src->clone()->set_tensor_shape(logits_sum_shape)); + + // Validate Component + ArgumentPack<ITensorInfo> arguments_exp_sum; + ArgumentPack<ITensorInfo> arguments_norm; + + arguments_exp_sum.add_const_tensor(ACL_SRC_0, src); + arguments_exp_sum.add_const_tensor(ACL_DST_0, &sum); + arguments_exp_sum.add_const_tensor(ACL_DST_1, &logits); + + arguments_norm.add_const_tensor(ACL_SRC_0, &logits); + arguments_norm.add_const_tensor(ACL_SRC_1, &sum); + arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + + ARM_COMPUTE_UNUSED(properties, attributes); + return Status(ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not implemented"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not Supported"}; +} + +Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *src, + const ITensorInfo *dst, + const Attributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); + ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast<int32_t>(-src->num_dimensions()) || + static_cast<int32_t>(src->num_dimensions()) <= attributes.axis()); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + auto_init_if_empty(dst_info_to_validate, *src->clone()); + + const size_t actual_axis = + static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions()))); + const bool needs_permute = actual_axis != 0; + ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet."); + + // Perform fusion test and check if the operator meets the fusion constraints + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + 
tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); +} + +void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_LOG_PARAMS(src, dst, attributes); + TensorShape logits_sum_shape = src->tensor_shape(); + ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + logits_sum_shape.set(0, 1); + ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + + // Auto initialize dst tensor info and the auxiliary tensor infos as well + auto_init_if_empty(*dst, *src->clone()); + + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuSoftmax::validate_op(sketch, src, dst, attributes)); + ARM_COMPUTE_ERROR_ON_NULLPTR(logits, sum); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + const auto sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_UNUSED(cl_compile_ctx); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add Direct Conv2d Component + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments_exp_sum; + 
ArgumentPack<ITensorInfo> arguments_norm; + + arguments_exp_sum.add_const_tensor(ACL_SRC_0, src); + arguments_exp_sum.add_const_tensor(ACL_DST_0, sum); + arguments_exp_sum.add_const_tensor(ACL_DST_1, logits); + + arguments_norm.add_const_tensor(ACL_SRC_0, logits); + arguments_norm.add_const_tensor(ACL_SRC_1, sum); + arguments_norm.add_const_tensor(ACL_DST_0, dst); + + // Add to component graph -- NOT IMPLEMENTED + ARM_COMPUTE_UNUSED(comp_graph, attributes); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_DST_0, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp new file mode 100644 index 0000000000..c53453a15c --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
*/
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"

#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"

#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
// Validate a Sub operator against the sketch (fusion test + configuration support).
// Inputs must be non-null, F16 or F32, and of matching data type; all remaining
// checks are delegated to GpuElementwiseBinaryCommon::validate_op().
Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");

    // Set the elementwise operation to Sub then call the elementwise common validate_op
    ElementwiseBinaryCommonAttributes common_attributes{};
    common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
    return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
}

// Check whether a Sub operator configuration is supported by the given context,
// irrespective of fusion. Performs the same Sub-specific data-type checks as
// validate_op() before delegating to the common implementation.
Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");

    // Set the elementwise operation to Sub then call the elementwise common is_supported_op
    ElementwiseBinaryCommonAttributes common_attributes{};
    common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
    return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
}

// Create a Sub operator, fuse it into the sketch, and return the destination tensor info.
ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
{
    // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
    // Set the elementwise operation to Sub then call the elementwise common create_op
    ElementwiseBinaryCommonAttributes common_attributes{};
    common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
    return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
new file mode 100644
index 0000000000..b9d01966b3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
@@ -0,0 +1,163 @@
/*
 * Copyright (c) 2023-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
+ */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h" + +#include "arm_compute/core/experimental/Types.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + auto_init_if_empty(dst_info_to_validate, *src->clone()); + + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + // Validate Activation Component + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC, src); + arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr); + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + return Status{}; +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; +} // namespace + +Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) +{ + return is_supported_op_helper(context, src, nullptr); +} + +Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, 
const ITensorInfo *src) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + // Check if tensors have valid id, i.e. they are created from a sketch + ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + auto_init_if_empty(dst_info_to_validate, *src->clone()); + + // Perform fusion test to check if the operator meets fusion constraints + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC, src); + tensors.add_const_tensor(ACL_DST, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); +} + +ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src); + ARM_COMPUTE_ERROR_THROW_ON(GpuTanh::validate_op(sketch, src)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + // Auto initialize dst tensor + auto_init_if_empty(*dst, *src->clone()); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; + + const auto *const sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + // Add Activation Component + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + 
ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC, src); + arguments.add_const_tensor(ACL_DST, dst); + comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info); + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC, src); + tensors.add_const_tensor(ACL_DST, dst); + + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp new file mode 100644 index 0000000000..d79a4c42c9 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs) +{ + if (dst->total_size() == 0U) + { + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first)); + } +} + +Status is_supported_op_helper(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + + TensorInfo dst_info_to_validate; + const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; + + if (dst != nullptr) + { + dst_info_to_validate_ptr = dst; + } + + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); + + // Check components + if (context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + 
ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + + // Validate ElementwiseBinary Component + { + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); + + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentElementwiseBinary::validate(arguments, attributes)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{}; +} + +GpuOperatorType operator_type = GpuOperatorType::Simple; +} // namespace + +ElementwiseBinaryCommonAttributes & +ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) +{ + _operation = operation; + return *this; +} + +ElementwiseBinaryCommonAttributes::ElementwiseOp ElementwiseBinaryCommonAttributes::operation() const +{ + return _operation; +} + +Status GpuElementwiseBinaryCommon::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ElementwiseBinaryCommonAttributes &attributes) +{ + return is_supported_op_helper(context, lhs, rhs, nullptr, attributes); +} + +Status GpuElementwiseBinaryCommon::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON(!lhs->has_valid_id() || !rhs->has_valid_id()); + + // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object + TensorInfo dst_info_to_validate; + + // Auto initialize dst tensor info + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); + + // Perform fusion test + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = 
sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op_helper(*sketch.gpu_context(), lhs, rhs, &dst_info_to_validate, attributes); +} + +ITensorInfo *GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs); + ARM_COMPUTE_ERROR_THROW_ON(GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, attributes)); + + ITensorInfo *dst = sketch.implementation().create_virtual_tensor(); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + + // Auto initialize dst tensor + calculate_and_init_dst_if_empty(dst, lhs, rhs); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + + const auto sketch_ctx = sketch.implementation().context(); + + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); + + // Add ElementwiseBinary Component + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component<ClComponentElementwiseBinary>(properties, arguments, attributes); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + 
ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_tensor(ACL_DST_0, dst); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); + + return dst; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h new file mode 100644 index 0000000000..0b58b6eb96 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON + +#include "arm_compute/core/Error.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; + +namespace experimental +{ +namespace dynamic_fusion +{ +class ElementwiseBinaryCommonAttributes +{ +public: + enum class ElementwiseOp + { + Add, /**< (x + y) */ + Sub, /**< (x - y) */ + Div, /**< (x / y) */ + Mul, /**< (x * y) */ + Min, /**< Min(x, y) */ + Max, /**< Max(x, y) */ + SquaredDiff, /**< (x - y)^2 */ + Power, /**< x ^ y */ + Prelu, /**< y*x if x < 0, x otherwise */ + }; + /** Set operation*/ + ElementwiseBinaryCommonAttributes &operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation); + /** Get operation*/ + ElementwiseOp operation() const; + +private: + ElementwiseOp _operation; /**< Elementwise operation */ +}; + +/** Forward declaration */ +class GpuWorkloadContext; +class GpuWorkloadSketch; + +/** Operator interface. */ +class GpuElementwiseBinaryCommon final +{ +public: + /** Create an operator and fuse it into the workload sketch. + * @note If @ref validate_op() fails, the creation also fails and may throw an error. + * @note If @ref validate_op() fails, @p sketch remains unchanged and valid. + * + * Valid data type configurations are checked at the operator level i.e. GpuAdd::validate_op(), GpuSub::validate_op(), ... etc. + * + * Valid data layouts: + * - Any + * + * @param[in,out] sketch Workload sketch into which the operator will be fused + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc. 
+ * + * @return Pointer for the destination tensor info + */ + static ITensorInfo *create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + const ElementwiseBinaryCommonAttributes &attributes); + /** Check if the operator configuration is supported, irrespective of fusion + * + * @param[in] context Workload context within which the operator is running + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc. + * + * @return Status + */ + static Status is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ElementwiseBinaryCommonAttributes &attributes); + /** Validate the operator and check if it can be fused into the workload sketch. + * + * Parameters are similar to @ref GpuElementwiseBinaryCommon::create_op() + * + * @return Status + */ + static Status validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *rhs, + const ITensorInfo *lhs, + const ElementwiseBinaryCommonAttributes &attributes); +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON */ diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h new file mode 100644 index 0000000000..c157c2b21c --- /dev/null +++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH +#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH + +#include "arm_compute/core/Error.h" + +#include <cstdint> +#include <map> +#include <set> +#include <tuple> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +template <typename T> +bool is_in(const T &v, const std::vector<T> &vec) +{ + return std::find(std::begin(vec), std::end(vec), v) != std::end(vec); +} +} // namespace + +/** A multi-input (tensors), multi-output (tensors) acyclic directed graph + * Represented as a doubly-linked adjacency list with the differentiation between source and destination + */ +class DependencyGraph +{ +public: + using Id = int32_t; + using TensorId = Id; + using OperatorId = Id; + /** Adjacency list + * + */ + using AdjList = std::map<Id, std::vector<Id>>; + + /** A pack of operator including its input and output tensors, used by traversing through the graph in topological order + * + */ + struct OpPack + { + OperatorId op{}; + std::vector<TensorId> inputs{}; + std::vector<TensorId> outputs{}; + friend bool operator==(const OpPack &opp0, const OpPack &opp1) + { + return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) == + std::make_tuple(opp1.op, opp1.inputs, opp1.outputs); + } + }; + +public: + DependencyGraph() = default; + friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &); + + /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list + * + * Rule: If the new operator is not the first operator, at least one input tensor must be + * the output tensor of the last non-output operator. All other input tensors must be + * the global input of the graph (i.e. not the output of any operator). + * + * Rule: The output tensor of the new operator must not be the input tensor of any previously + * added operator. 
+ * + * PRECONDITION: The current graph is already linear + * + * @return true If the operator can be added while keeping the graph as a linear sequence + * @return false Otherwise + */ + bool try_add_operator_as_linear(OperatorId op, + const std::vector<TensorId> &inputs, + const std::vector<TensorId> &outputs, + bool is_output = false) const + { + ARM_COMPUTE_UNUSED(op, is_output); + if (all_ops().empty()) + { + return true; + } + + // If the new operator is not the first operator, at least one input tensor must be + // the output tensor of the last non-output operator. All other input tensors must be + // the global input of the graph (i.e. not the output of any operator). + if (_last_op_available) + { + auto use_input_from_last_op = false; + + for (auto src_tensor : inputs) + { + const auto src_ops = _adj_src_ops.find(src_tensor); + + if (src_ops != _adj_src_ops.end()) + { + ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1); + + if (!src_ops->second.empty()) + { + const auto src_op = src_ops->second[0]; + + if (src_op == _last_op) + { + if (use_input_from_last_op) + { + // To be safe, we also forbid using the output tensor + // of the last operator twice. + return false; + } + + use_input_from_last_op = true; + } + else + { + // The input tensor of this operator must not be the output tensor + // of any other operator except the last non-output operator. + return false; + } + } + } + } + + if (!use_input_from_last_op) + { + // At least one input tensor must be the output tensor of the last non-output operator. + return false; + } + } + + // The output tensor of the new operator must not be the input tensor of any previously + // added operator. 
+ for (auto dst_tensor : outputs) + { + if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) + { + return false; + } + } + + return true; + } + /** Add an operator, while keeping the graph as a "linear sequence" + * + * PRECONDITION: The current graph is already linear + * INVARIANT: The list can only grow from head to tail + * INVARIANT: POSTCONDITION: The graph is linear + */ + void add_operator_as_linear(OperatorId op, + const std::vector<TensorId> &inputs, + const std::vector<TensorId> &outputs, + bool is_output = false) + { + const auto success = add_operator(op, inputs, outputs, is_output); + ARM_COMPUTE_UNUSED(success); + ARM_COMPUTE_ERROR_ON(!success); + } + /** Add a new operator + * Return invalid if it violates the DAG invariant + * Invalid operation will not change the graph + * + * @param[in] op Operator to add + * @param[in] inputs Input tensors to the operator + * @param[in] outputs Output tensors to the operator + * @param[in] is_output Whether this is an output operator + */ + bool add_operator(OperatorId op, + const std::vector<TensorId> &inputs, + const std::vector<TensorId> &outputs, + bool is_output = false) + { + if (operator_exists(op)) + { + return false; + } + _adj_src_tensors[op] = {}; + _adj_dst_tensors[op] = {}; + for (auto in_tensor : inputs) + { + // Linking input tensor to operator node will never create a cycle / loop because we guarantee + // each op is newly created, so every <input, op> pair / edge is new + link_input(op, in_tensor); + } + for (auto out_tensor : outputs) + { + // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle + if (path_exists_from_tensor_to_op(out_tensor, op)) + { + remove_operator(op); + return false; + } + else + { + link_output(op, out_tensor); + } + } + + if (!is_output) + { + _last_op_available = true; + _last_op = op; + } + + return true; + } + + /** Build a sequence of operators from the acyclic graph of operators. 
+ * + * The graph will be visited in depth-first strategy. The operator can only be added to + * the sequence when all operators that supply the input tensors have been added. Otherwise, + * the operator will be ignored and later visited again. In other words, the dependency between + * operators will be preserved in the sequence. + */ + std::vector<OpPack> build_operators_sequence() const + { + std::vector<OpPack> ops_seq; + std::set<Id> done_ops; + std::set<Id> done_tensors; + + const auto input_tensors = global_src_tensors(); + + for (auto tensor : input_tensors) + { + done_tensors.insert(tensor); + + for (auto op : _adj_dst_ops.at(tensor)) + { + build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors); + } + } + + return ops_seq; + } + + /** Strict equality comparison (all internal ids and order of insertion matter). + * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal + * + * + * @param[in] g0 + * @param[in] g1 + * @return true If the same + * @return false Otherwise + */ + friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1) + { + // Do not compare id allocators + return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) == + std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops); + } + std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const + { + return _adj_src_ops.at(tensor); + } + std::vector<OperatorId> dst_ops_from_tensor(TensorId tensor) const + { + return _adj_dst_ops.at(tensor); + } + /** Get all tensors + * + * @return std::vector<TensorId> + */ + std::vector<TensorId> all_tensors() const + { + std::vector<TensorId> tensors{}; + std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), + [](const auto &it) { return it.first; }); + return tensors; + } + /** Get source tensors of the whole graph + * + * 
@return std::vector<TensorId> + */ + std::vector<TensorId> global_src_tensors() const + { + std::vector<TensorId> tensors; + for (auto tensor_src_ops : _adj_src_ops) + { + if (tensor_src_ops.second.empty()) + { + tensors.push_back(tensor_src_ops.first); + } + } + return tensors; + } + /** Get destination tensors of the whole graph + * + * @return std::vector<TensorId> + */ + std::vector<TensorId> global_dst_tensors() const + { + std::vector<TensorId> tensors; + for (auto tensor_dst_ops : _adj_dst_ops) + { + if (tensor_dst_ops.second.empty()) + { + tensors.push_back(tensor_dst_ops.first); + } + } + return tensors; + } + /** Get intermediate tensors of the whole graph. + * + * @return std::vector<TensorId> + */ + std::vector<TensorId> intermediate_tensors() const + { + std::vector<TensorId> tensors; + + // If a tensor is used to connect the input of an operator and the output of another operator, + // it is not allocated in the memory. The tensor exists as a temporary variable only. + for (auto src_tensor : _adj_src_ops) + { + if (!src_tensor.second.empty()) + { + const auto dst_tensor = _adj_dst_ops.find(src_tensor.first); + if (dst_tensor != _adj_dst_ops.end()) + { + if (!dst_tensor->second.empty()) + { + tensors.push_back(src_tensor.first); + } + } + } + } + + return tensors; + } + /** Get all root ops. 
Root ops can also be referred to as "src ops" of the whole graph + * + * @return std::vector<OperatorId> + */ + std::vector<OperatorId> get_root_ops() const + { + std::vector<OperatorId> ops{}; + const auto op_list = all_ops(); + + for (auto op : op_list) + { + if (src_ops(op).empty()) + { + ops.emplace_back(op); + } + } + return ops; + } + +private: + void link_input(OperatorId op, TensorId in_tensor) + { + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + if (!tensor_exists(in_tensor)) + { + insert_new_tensor(in_tensor); + } + ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); // Prevent repetitive linking + _adj_src_tensors[op].push_back(in_tensor); + _adj_dst_ops[in_tensor].push_back(op); + } + void link_output(OperatorId op, TensorId out_tensor) + { + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + if (!tensor_exists(out_tensor)) + { + insert_new_tensor(out_tensor); + } + ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); // Prevent repetitive linking + _adj_dst_tensors[op].push_back(out_tensor); + _adj_src_ops[out_tensor].push_back(op); + } + + std::vector<OperatorId> src_ops(OperatorId op) const + { + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + std::vector<OperatorId> ops{}; + for (TensorId src_tensor : src_tensors(op)) + { + ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor))); + } + return ops; + } + std::vector<OperatorId> dst_ops(OperatorId op) const + { + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + std::vector<OperatorId> ops{}; + for (TensorId dst_tensor : _adj_dst_tensors.at(op)) + { + ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor))); + } + return ops; + } + + /** Get source tensors to an operator + * + * @param[in] op + * @return std::vector<TensorId> + */ + std::vector<TensorId> src_tensors(OperatorId op) const + { + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + return _adj_src_tensors.at(op); + } + /** Get destination tensors to an operator + * + 
* @param[in] op + * @return std::vector<TensorId> + */ + std::vector<TensorId> dst_tensors(OperatorId op) const + { + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + return _adj_dst_tensors.at(op); + } + /** Get all operators + * + * @return std::vector<OperatorId> + */ + std::vector<OperatorId> all_ops() const + { + std::vector<OperatorId> ops{}; + std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), + [](const auto &it) { return it.first; }); + return ops; + } + /** Remove an operator from graph. + * + * @param[in] op + */ + void remove_operator(OperatorId op) + { + for (auto src_tensor : _adj_src_tensors.at(op)) + { + auto &dst_ops = _adj_dst_ops.at(src_tensor); + dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops)); + } + for (auto dst_tensor : _adj_dst_tensors.at(op)) + { + auto &src_ops = _adj_src_ops.at(dst_tensor); + src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops)); + } + // Remove any isolated tensors + // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty + for (auto t : all_tensors()) + { + if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty()) + { + _adj_src_ops.erase(t); + _adj_dst_ops.erase(t); + } + } + _adj_src_tensors.erase(op); + _adj_dst_tensors.erase(op); + } + void insert_new_tensor(TensorId tensor) + { + _adj_src_ops[tensor] = {}; + _adj_dst_ops[tensor] = {}; + } + bool tensor_exists(TensorId tensor) const + { + return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end(); + } + bool operator_exists(OperatorId op) const + { + return _adj_src_tensors.find(op) != _adj_src_tensors.end() && + _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); + } + bool is_src_tensor_of(OperatorId op, TensorId tensor) const + { + if (!operator_exists(op) || !tensor_exists(tensor)) + { + return false; + } + const auto op_inputs = src_tensors(op); + return 
std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end(); + } + bool is_dst_tensor_of(OperatorId op, TensorId tensor) const + { + if (!operator_exists(op) || !tensor_exists(tensor)) + { + return false; + } + const auto op_outputs = dst_tensors(op); + return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end(); + } + bool are_connected(OperatorId op, TensorId tensor) const + { + return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor); + } + /** If op is the destination / leaf operator of the whole graph + * + * @param[in] op + * @return true + * @return false + */ + bool is_dst_op(OperatorId op) const + { + return dst_ops(op).empty(); + } + std::vector<OperatorId> get_dst_ops() const + { + std::vector<OperatorId> ops{}; + const auto op_list = all_ops(); + + for (auto op : op_list) + { + if (is_dst_op(op)) + { + ops.emplace_back(op); + } + } + return ops; + } + bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const + { + if (!tensor_exists(src_tensor) || !operator_exists(dst_op)) + { + return false; + } + for (auto child_op : dst_ops_from_tensor(src_tensor)) + { + if (path_exists_from_op_to_op(child_op, dst_op)) + { + return true; + } + } + return false; + } + + bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const + { + if (!operator_exists(src_op) || !operator_exists(dst_op)) + { + return false; + } + if (src_op == dst_op) + { + return true; + } + if (is_in(src_op, get_dst_ops())) + { + return false; + } + for (auto child_tensor : dst_tensors(src_op)) + { + if (path_exists_from_tensor_to_op(child_tensor, dst_op)) + { + return true; + } + } + return false; + } + + void build_operators_sequence_from_op(Id op, + std::vector<OpPack> &ops_seq, + std::set<Id> &done_ops, + std::set<Id> &done_tensors) const + { + while (true) + { + // If the operator has been added to the sequence, ignore it. 
+ if (done_ops.find(op) != done_ops.end()) + { + return; + } + + // If not all the input tensors of the operator are available, this operator cannot be + // added to the sequence for now. It will be visited again after the source operator + // is added to the sequence. + const auto src_tensors = _adj_src_tensors.at(op); + + for (auto src : src_tensors) + { + if (done_tensors.find(src) == done_tensors.end()) + { + return; + } + } + + // This operator is ready to be added to the sequence. + const auto dst_tensors = _adj_dst_tensors.at(op); + + done_ops.insert(op); + + OpPack pack{op, src_tensors, dst_tensors}; + ops_seq.push_back(pack); + + done_tensors.insert(dst_tensors.begin(), dst_tensors.end()); + + // Visit all the sink operators. + // Call this function recursively unless there is only one sink. + if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) + { + op = _adj_dst_ops.at(dst_tensors[0])[0]; + } + else + { + for (auto dst_tensor : dst_tensors) + { + const auto dst_ops = _adj_dst_ops.at(dst_tensor); + + for (auto dst_op : dst_ops) + { + build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors); + } + } + + return; + } + } + } + +private: + AdjList _adj_src_tensors{}; + AdjList _adj_dst_tensors{}; + AdjList _adj_src_ops{}; + AdjList _adj_dst_ops{}; + + bool _last_op_available{false}; + OperatorId _last_op{0}; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH */ |