author    Viet-Hoa Do <viet-hoa.do@arm.com>  2022-12-16 14:45:57 +0000
committer Viet-Hoa Do <viet-hoa.do@arm.com>  2022-12-30 13:59:23 +0000
commit    3558c5840e7c973e2b1a86ae3a9335b44cad59d4 (patch)
tree      b5f14b344ff8bc03e5143a54a5f3480263db543e /src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
parent    9d3bd41030366326e9c8afe5db3a5812a76b135b (diff)
download  ComputeLibrary-3558c5840e7c973e2b1a86ae3a9335b44cad59d4.tar.gz
Add temporary tile support for dynamic fusion
* Multiple intermediate tensors can share the same tile.
  - A simple operator can reuse the input tensor's tile for its result if the input tensor has the same shape and data type and is consumed only by that operator.
  - A special case is when a simple operator and an output operator consume the same tensor. Since the output operator doesn't change the content of the input tensor, it doesn't count as "consuming" it.
* These temporary tiles are declared automatically by the template writer, so individual operators don't need to generate output tile declarations.
* Cast is now a simple operator.

Resolves: COMPMID-5778

Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I232647ac976645e2d266a62e055b9eb48c356a8e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8877
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
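The tile-reuse rule this patch implements can be summarized with the following minimal standalone sketch. This is not ComputeLibrary code: TensorDesc, ComponentType and can_reuse_tile are hypothetical stand-ins for ITensorInfo, GpuComponentType and the logic inside GpuKernelComponentGroup::finalize() shown in the diff below.

#include <cstdint>
#include <vector>

// Hypothetical stand-in for ITensorInfo, reduced to the properties that the
// tile-sharing rule actually inspects.
struct TensorDesc
{
    std::vector<int32_t> shape;
    int32_t              data_type;
};

enum class ComponentType { Simple, Complex, Unfusable, Output };

// Sketch of the rule: the destination of a component may overwrite the tile
// holding 'src' only if shapes and data types match, the component is simple
// (or an output, which does not modify the tile), and the source tile has at
// most one consumer so far.
bool can_reuse_tile(const TensorDesc &src, const TensorDesc &dst,
                    ComponentType type, int32_t src_tile_usages)
{
    if(src.shape != dst.shape || src.data_type != dst.data_type)
    {
        return false; // tile layouts differ, so a dedicated tile is needed
    }
    if(type != ComponentType::Simple && type != ComponentType::Output)
    {
        return false; // complex/unfusable components always get their own tile
    }
    return src_tile_usages <= 1; // a tile shared by several consumers cannot be overwritten
}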
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp')
-rw-r--r-- src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp | 128
1 file changed, 124 insertions(+), 4 deletions(-)
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
index 0d2574957f..81c3f0c800 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -133,8 +133,9 @@ void GpuKernelComponentGroup::finalize()
_finalized = true;
- std::set<const ITensorInfo *> input_tensors;
std::set<const ITensorInfo *> output_tensors;
+ std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
+ std::map<const ITensorInfo *, int32_t> tile_usages;
for(auto component : _components)
{
@@ -156,26 +157,139 @@ void GpuKernelComponentGroup::finalize()
}
else if(_interm_tensors.find(tensor) == _interm_tensors.end())
{
- input_tensors.insert(tensor);
+ _input_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
}
}
for(auto tensor : dst_tensors)
{
- ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end());
+ ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
output_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
+ }
+
+ // Check if the output can overwrite the input tile.
+ const auto component_type = component->type();
+ if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
+ {
+ ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);
+
+ const auto dst_tensor = dst_tensors[0];
+ const auto &dst_shape = dst_tensor->tensor_shape();
+ const auto &dst_type = dst_tensor->data_type();
+
+ tile_usages[dst_tensor] = 0;
+
+ for(auto src_tensor : src_tensors)
+ {
+ const auto &src_shape = src_tensor->tensor_shape();
+ const auto &src_type = src_tensor->data_type();
+
+ if(src_shape == dst_shape && src_type == dst_type)
+ {
+ const auto tile_usages_it = tile_usages.find(src_tensor);
+ ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());
+
+ if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
+ {
+ // Increase the number of tile usages unless this component is an output
+ // and the tile has not been shared with any component.
+ // (Reason: output component doesn't change the content of the tile)
+ ++tile_usages_it->second;
+ }
+
+ possible_tile_map[dst_tensor].push_back(src_tensor);
+ }
+ }
+ }
+ else
+ {
+            // Outputs of complex and unfusable components need a dedicated tile.
+ for(auto tensor : dst_tensors)
+ {
+ tile_usages[tensor] = 0;
+ }
+ }
+ }
+
+ // Find the smallest list of tiles that the intermediate tensors need to write to.
+ for(auto tensor : _input_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ for(auto component : _components)
+ {
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+
+ for(auto tensor : dst_tensors)
+ {
+ const auto target_tiles = possible_tile_map.at(tensor);
+ _tile_map[tensor] = tensor;
+
+ for(auto target : target_tiles)
+ {
+ const auto num_usage = tile_usages[target];
+
+ if(num_usage <= 1)
+ {
+ // The target tile is consumed by only this operator, so we can reuse it
+ // for the destination tensor data.
+ _tile_map[tensor] = _tile_map.at(target);
+ break;
+ }
+ }
+ }
+ }
+
+ for(auto tensor : output_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+    // All intermediate tensors that cannot be shared with any previous tensor
+    // will need to be declared as tile variables.
+ for(auto tensor_tile : _tile_map)
+ {
+ if(tensor_tile.first == tensor_tile.second &&
+ _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
+ {
+ _tiles.push_back(tensor_tile.first);
}
}
std::set_union(
- input_tensors.begin(), input_tensors.end(),
+ _input_tensors.begin(), _input_tensors.end(),
output_tensors.begin(), output_tensors.end(),
std::back_inserter(_argument_tensors));
_any_output_tensor = *output_tensors.begin();
}
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _tiles;
+}
+
+const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+
+ if(_tile_map.find(tensor) != _tile_map.end())
+ {
+ return _tile_map.at(tensor);
+ }
+
+ return tensor;
+}
+
const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
{
ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
@@ -203,6 +317,12 @@ bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor)
return _interm_tensors.find(tensor) != _interm_tensors.end();
}
+bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _input_tensors.find(tensor) != _input_tensors.end();
+}
+
size_t GpuKernelComponentGroup::size() const
{
return _components.size();
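For context, a caller that drives code generation over a finalized group could use the new accessors roughly as follows. This is hypothetical caller code: emit_tiles, declare_tile and emit_access are made-up helpers, not part of this patch; only get_tiles(), get_tile_for_tensor() and is_input_tensor() come from the diff above.

void emit_tiles(const GpuKernelComponentGroup &group, const ITensorInfo *tensor)
{
    // Declare one tile variable per entry returned by get_tiles(); tensors
    // that alias an earlier tile are absent from this list.
    for(const ITensorInfo *tile : group.get_tiles())
    {
        declare_tile(tile); // hypothetical template-writer helper
    }

    // Loads and stores then go through get_tile_for_tensor(), so tensors
    // that share storage resolve to the same tile variable.
    const ITensorInfo *storage          = group.get_tile_for_tensor(tensor);
    const bool         load_from_memory = group.is_input_tensor(tensor);
    emit_access(storage, load_from_memory); // hypothetical helper
}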