From 9bfe0f86ea525055954b160a3c678024743030ec Mon Sep 17 00:00:00 2001 From: Louis Verhaard Date: Thu, 3 Dec 2020 12:26:25 +0100 Subject: MLBEDSW-1373: Added search based allocator Added a new tensor allocator that is based on searching, implemented in C++ (C++11 compatible). Change-Id: Ie96e9fcfc8e6c58d1fa53911f37de290eeba88cf Signed-off-by: Louis Verhaard --- ethosu/tensor_allocator/search_allocator.cpp | 266 +++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 ethosu/tensor_allocator/search_allocator.cpp (limited to 'ethosu/tensor_allocator/search_allocator.cpp') diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp new file mode 100644 index 00000000..ce5c46de --- /dev/null +++ b/ethosu/tensor_allocator/search_allocator.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Description: + * Implementation of the search-based allocator. + */ + +#include +#include +#include +#include + +#include "search_allocator.h" + +SearchAllocator::SearchAllocator(const std::vector &live_ranges, uint32_t size_limit) { + lrs = live_ranges; + uint32_t max_end_time = 0; + for (size_t i = 0; i < lrs.size(); ++i) { + auto &lr = lrs[i]; + lr.id = i; + max_end_time = std::max(max_end_time, lr.end_time); + } + lrs_at_time.resize(max_end_time + 1); + size_at_time.resize(max_end_time + 1); + neighbours.resize(lrs.size()); + // Calculate which live ranges are active at every timestamp + for (size_t t = 0; t <= max_end_time; ++t) { + lrs_at_time[t].clear(); + } + for (auto &lr : lrs) { + for (auto t = lr.start_time; t <= lr.end_time; ++t) { + lrs_at_time[t].push_back(&lr); + } + } + min_required_size = 0; + for (size_t t = 0; t <= max_end_time; ++t) { + // Calculate minimum needed size at each timestamp + uint32_t size_at_t = 0; + for (auto &lr : lrs_at_time[t]) { + size_at_t += lr->size; + } + size_at_time[t] = size_at_t; + min_required_size = std::max(size_at_t, min_required_size); + // Calculate all neighbours + for (size_t i = 0; i < lrs_at_time[t].size(); ++i) { + auto lr1 = lrs_at_time[t][i]; + auto &nb1 = neighbours[lr1->id]; + for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) { + auto lr2 = lrs_at_time[t][j]; + if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) { + nb1.push_back(lr2); + neighbours[lr2->id].push_back(lr1); + } + } + } + } + target_size = std::max(min_required_size, available_size); + // Calculate the urgency of each live range + lr_urgency.resize(lrs.size()); + for (size_t i = 0; i < lrs.size(); ++i) { + auto &lr = lrs[i]; + uint32_t urgency = 0; + for (size_t t = lr.start_time; t <= lr.end_time; ++t) { + urgency = std::max(size_at_time[t], urgency); + } + lr_urgency[i] = urgency; + } + best_size = UINT32_MAX; +} + +uint32_t SearchAllocator::allocate(std::vector &output) { + output.clear(); + nr_iterations = 0; + std::vector indices; + // Initial solution, using a heuristic allocator + for (size_t i = 0; i < lrs.size(); ++i) { + indices.push_back(i); + } + sort_indices_on_prio(indices); + // Allocate the initial solution + best_size = UINT32_MAX; + best_size = allocate_indices(indices); + if (best_size <= target_size) { + // The heuristic allocation returned an optimal solution. + // No need to search. + } else { + // Try to improve the heuristic allocation + search(indices, best_size, MAX_ITERATIONS); + } + output.clear(); + for (auto &lr : lrs) { + output.push_back(lr.address); + } + return best_size; +} + +void SearchAllocator::allocate_lr(LiveRange &lr) const { + uint32_t address = 0; + int predecessor = NO_PREDECESSOR; + bool fits = false; + while (!fits) { + fits = true; + // Find neighbours that overlap with address + for (auto lr2_p : neighbours[lr.id]) { + if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) { + continue; + } + if (lr2_p->overlaps(address, lr.size)) { + // Overlap found; increase address + fits = false; + address = lr2_p->end_address; + predecessor = lr2_p->id; + } + } + } + lr.address = address; + lr.end_address = address + lr.size; + lr.predecessor = predecessor; +} + +uint32_t SearchAllocator::allocate_indices(const std::vector &indices) { + ++nr_iterations; + std::vector count(indices.size()); + for (auto &lr : lrs) { + lr.address = NOT_ALLOCATED; + } + uint32_t size = 0; + for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) { + auto &lr = lrs[indices[turn]]; + allocate_lr(lr); + lr.turn = turn; + size = std::max(size, lr.end_address); + } + return size; +} + +void SearchAllocator::sort_indices_on_prio(std::vector &indices) const { + std::sort(indices.begin(), indices.end(), + [this] (size_t const& a, size_t const& b) { + if (lr_urgency[a] != lr_urgency[b]) { + return lr_urgency[a] > lr_urgency[b]; + } + auto &lr1 = lrs[a]; + auto &lr2 = lrs[b]; + auto duration1 = lr1.end_time - lr1.start_time; + auto duration2 = lr2.end_time - lr2.start_time; + if (duration1 != duration2) { + return duration1 > duration2; + } + if (lr1.start_time != lr2.start_time) { + return lr1.start_time < lr2.start_time; + } + if (lr1.size != lr2.size) { + return lr1.size > lr2.size; + } + return lr1.id < lr2.id; + }); +} + +void SearchAllocator::add_predecessor_turns(std::set &turns, const LiveRange &lr) const { + turns.insert(lr.turn); + int id = lr.id; + while (lrs[id].predecessor != NO_PREDECESSOR) { + id = lrs[id].predecessor; + turns.insert(lrs[id].turn); + } +} + +void SearchAllocator::attempt_bottleneck_fix(std::vector &indices) { + // Find the bottleneck + LiveRange *max_lr = &lrs[0]; + for (auto &lr : lrs) { + if (lr.end_address > max_lr->end_address) { + max_lr = &lr; + } + } + // Find all live ranges that affected the placement of the bottleneck live range. + // This consists of two types of live ranges: + // - direct neighbours of the bottleneck live range + // - direct and indirect predecessors of these neighbours + bottleneck + // The turns at which these live ranges were allocated are put in the turns vector. + std::set turns; + add_predecessor_turns(turns, *max_lr); + for (auto lr_p : neighbours[max_lr->id]) { + add_predecessor_turns(turns, *lr_p); + } + // Non-direct neighbours that interfere with the allocation of the bottleneck are the + // immediate cause for gaps in the allocation, and are selected with higher probability. + std::vector turn_list; + std::vector non_nb_turn_list; + for (auto turn : turns) { + turn_list.push_back(turn); + auto &lr = lrs[indices[turn]]; + if (!max_lr->is_neighbour(lr)) { + non_nb_turn_list.push_back(turn); + } + } + size_t ix1; + if (rng() % 100 < 30 && !non_nb_turn_list.empty()) { + // Pick a live range from the "non-neighbour list" + ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()]; + } else { + // Pick any affecting live range. + ix1 = turn_list[rng() % turn_list.size()]; + } + size_t ix2 = turn_list[rng() % turn_list.size() - 1]; + if (ix1 == ix2) { + ix2 = turn_list[turn_list.size() - 1]; + } + // Swap indices + std::swap(indices[ix1], indices[ix2]); +} + +void SearchAllocator::search(std::vector &indices, uint32_t initial_size, int iterations) { + std::vector best_indices = indices; + std::vector best_lrs = lrs; + for (int i = 0; i < iterations; ++i) { + // Reorder the indices + attempt_bottleneck_fix(indices); + // Allocate the reordered indices and check if it gave an improvement + auto new_size = allocate_indices(indices); + if (new_size <= best_size) { + // The new allocation produced a new best result; remember it + best_size = new_size; + best_indices = indices; + best_lrs = lrs; + if (best_size <= target_size) { + // Target reached; stop + return; + } + } else { + // The new allocation produced worse result; undo the change + indices = best_indices; + lrs = best_lrs; + } + } + lrs = best_lrs; +} + +uint32_t allocate(const std::vector &input, int available_size, std::vector &output) { + // Convert input to vector of live ranges + std::vector lrs; + for (size_t i = 0; i < input.size(); i += 3) { + LiveRange lr; + lr.start_time = input[i]; + lr.end_time = input[i+1]; + lr.size = input[i+2]; + lrs.push_back(lr); + } + SearchAllocator allocator(lrs, available_size); + return allocator.allocate(output); +} -- cgit v1.2.1