aboutsummaryrefslogtreecommitdiff
path: root/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp')
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp128
1 files changed, 124 insertions, 4 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
index 0d2574957f..81c3f0c800 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -133,8 +133,9 @@ void GpuKernelComponentGroup::finalize()
_finalized = true;
- std::set<const ITensorInfo *> input_tensors;
std::set<const ITensorInfo *> output_tensors;
+ std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
+ std::map<const ITensorInfo *, int32_t> tile_usages;
for(auto component : _components)
{
@@ -156,26 +157,139 @@ void GpuKernelComponentGroup::finalize()
}
else if(_interm_tensors.find(tensor) == _interm_tensors.end())
{
- input_tensors.insert(tensor);
+ _input_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
}
}
for(auto tensor : dst_tensors)
{
- ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end());
+ ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
output_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
+ }
+
+ // Check if the output can overwrite the input tile.
+ const auto component_type = component->type();
+ if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
+ {
+ ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);
+
+ const auto dst_tensor = dst_tensors[0];
+ const auto &dst_shape = dst_tensor->tensor_shape();
+ const auto &dst_type = dst_tensor->data_type();
+
+ tile_usages[dst_tensor] = 0;
+
+ for(auto src_tensor : src_tensors)
+ {
+ const auto &src_shape = src_tensor->tensor_shape();
+ const auto &src_type = src_tensor->data_type();
+
+ if(src_shape == dst_shape && src_type == dst_type)
+ {
+ const auto tile_usages_it = tile_usages.find(src_tensor);
+ ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());
+
+ if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
+ {
+ // Increase the number of tile usages unless this component is an output
+ // and the tile has not been shared with any component.
+ // (Reason: output component doesn't change the content of the tile)
+ ++tile_usages_it->second;
+ }
+
+ possible_tile_map[dst_tensor].push_back(src_tensor);
+ }
+ }
+ }
+ else
+ {
+ // Outputs of complex and unfusable components need dedicated tile.
+ for(auto tensor : dst_tensors)
+ {
+ tile_usages[tensor] = 0;
+ }
+ }
+ }
+
+ // Find the smallest list of tiles that the intermediate tensors need to write to.
+ for(auto tensor : _input_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ for(auto component : _components)
+ {
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+
+ for(auto tensor : dst_tensors)
+ {
+ const auto target_tiles = possible_tile_map.at(tensor);
+ _tile_map[tensor] = tensor;
+
+ for(auto target : target_tiles)
+ {
+ const auto num_usage = tile_usages[target];
+
+ if(num_usage <= 1)
+ {
+ // The target tile is consumed by only this operator, so we can reuse it
+ // for the destination tensor data.
+ _tile_map[tensor] = _tile_map.at(target);
+ break;
+ }
+ }
+ }
+ }
+
+ for(auto tensor : output_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ // All intermediate tensors that cannot be shared with any previous tensor
+ // will need to be declared as tile variable.
+ for(auto tensor_tile : _tile_map)
+ {
+ if(tensor_tile.first == tensor_tile.second &&
+ _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
+ {
+ _tiles.push_back(tensor_tile.first);
}
}
std::set_union(
- input_tensors.begin(), input_tensors.end(),
+ _input_tensors.begin(), _input_tensors.end(),
output_tensors.begin(), output_tensors.end(),
std::back_inserter(_argument_tensors));
_any_output_tensor = *output_tensors.begin();
}
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _tiles;
+}
+
+const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+
+ if(_tile_map.find(tensor) != _tile_map.end())
+ {
+ return _tile_map.at(tensor);
+ }
+
+ return tensor;
+}
+
const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
{
ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
@@ -203,6 +317,12 @@ bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor)
return _interm_tensors.find(tensor) != _interm_tensors.end();
}
+bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _input_tensors.find(tensor) != _input_tensors.end();
+}
+
size_t GpuKernelComponentGroup::size() const
{
return _components.size();