aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Verhaard <louis.verhaard@arm.com>2021-01-20 17:23:54 +0100
committerLouis Verhaard <louis.verhaard@arm.com>2021-02-12 17:40:21 +0100
commitd70025250fc49997801ea3a6ce83f2fa29a09d78 (patch)
tree07462a32ed30ba9893bb7825e44b9606a400e709
parentad45f792e699fe6abdc381f62690801aa50bd412 (diff)
downloadethos-u-vela-d70025250fc49997801ea3a6ce83f2fa29a09d78.tar.gz
MLBEDSW-3808: Ported search allocator to python2.1.0.rc1
- Straight port of the C++ implementation to python. - Renamed the allocator from "Search" to "HillClimb" Change-Id: I50797d541f326d0264daf79bf7866aef32350a60 Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
-rw-r--r--OPTIONS.md4
-rw-r--r--README.md6
-rw-r--r--ethosu/tensor_allocator/makefile50
-rw-r--r--ethosu/tensor_allocator/search_allocator.cpp267
-rw-r--r--ethosu/tensor_allocator/search_allocator.h181
-rw-r--r--ethosu/tensor_allocator/tensor_allocator_main.cpp77
-rw-r--r--ethosu/tensor_allocator/tensor_allocatormodule.cpp105
-rw-r--r--ethosu/vela/hillclimb_allocation.py310
-rw-r--r--ethosu/vela/nn_graph.py2
-rw-r--r--ethosu/vela/tensor_allocation.py28
-rw-r--r--ethosu/vela/test/test_hillclimb_allocation.py (renamed from ethosu/tensor_allocator/test/test_tensor_allocator.py)32
-rw-r--r--ethosu/vela/vela.py2
-rw-r--r--setup.py9
13 files changed, 344 insertions, 729 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index e191d4e..1293f17 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -183,8 +183,8 @@ vela network.tflite --config my_vela_cfg.ini --memory-mode My_Mem_Mode
Specify which allocator algorithm to use for non-constant NPU and CPU tensor
allocation.
**Type: String**
-**Default: Search**
-**Choices: [Greedy, LinearAlloc, Search]**
+**Default: HillClimb**
+**Choices: [Greedy, LinearAlloc, HillClimb]**
```bash
vela network.tflite --tensor-allocator=LinearAlloc
diff --git a/README.md b/README.md
index 81bb0a0..1d6d073 100644
--- a/README.md
+++ b/README.md
@@ -54,9 +54,9 @@ source code from
[ML Platform](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela).
Both methods will automatically install all the required dependencies.
-**Note:** For installing on Microsoft Windows you need to have a C99 and C++11
-capable toolchain installed. The recommended and tested toolchain is Microsoft
-Visual C++ 14.x Build Tools, see <https://wiki.python.org/moin/WindowsCompilers>
+**Note:** For installing on Microsoft Windows you need to have a C99 capable
+toolchain installed. The recommended and tested toolchain is Microsoft Visual
+C++ 14.x Build Tools, see <https://wiki.python.org/moin/WindowsCompilers>
### PyPi
diff --git a/ethosu/tensor_allocator/makefile b/ethosu/tensor_allocator/makefile
deleted file mode 100644
index 9636eb9..0000000
--- a/ethosu/tensor_allocator/makefile
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Description:
-# Makefile to build tensor_allocator_main
-
-UNAME=$(shell uname -o)
-
-CXXFLAGS=--std=c++11 -pedantic-errors -Wall -Werror -Wdate-time
-CXXFLAGS+=-fwrapv -fstack-protector-strong -flto -fuse-linker-plugin -ffat-lto-objects -fPIC
-
-ifeq ($(DEBUG),1)
- CXXFLAGS+=-g -O0 -DDEBUG
-else
- CXXFLAGS+=-O2
-endif
-
-LIBSRCS=tensor_allocator_main.cpp search_allocator.cpp
-LIBHDRS=search_allocator.h
-
-ifeq ($(UNAME),Cygwin)
- TENSOR_ALLOCATOR_EXE=tensor_allocator_main.exe
-else
- TENSOR_ALLOCATOR_EXE=tensor_allocator_main
-endif
-
-all: tensor_allocator_exe
-
-.PHONY: tensor_allocator_exe
-tensor_allocator_exe: $(TENSOR_ALLOCATOR_EXE)
-
-clean:
- rm -f $(TENSOR_ALLOCATOR_EXE)
-
-$(TENSOR_ALLOCATOR_EXE): $(LIBSRCS) $(LIBHDRS) makefile
- g++ $(CXXFLAGS) $(LIBSRCS) -o $(TENSOR_ALLOCATOR_EXE)
diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp
deleted file mode 100644
index 5c7492b..0000000
--- a/ethosu/tensor_allocator/search_allocator.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Description:
- * Implementation of the search-based allocator.
- */
-
-#include <algorithm>
-#include <cstdint>
-#include <set>
-#include <vector>
-
-#include "search_allocator.h"
-
-SearchAllocator::SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit) {
- lrs = live_ranges;
- uint32_t max_end_time = 0;
- for (size_t i = 0; i < lrs.size(); ++i) {
- auto &lr = lrs[i];
- lr.id = static_cast<int>(i);
- max_end_time = std::max(max_end_time, lr.end_time);
- }
- lrs_at_time.resize(max_end_time + 1);
- size_at_time.resize(max_end_time + 1);
- neighbours.resize(lrs.size());
- // Calculate which live ranges are active at every timestamp
- for (size_t t = 0; t <= max_end_time; ++t) {
- lrs_at_time[t].clear();
- }
- for (auto &lr : lrs) {
- for (auto t = lr.start_time; t <= lr.end_time; ++t) {
- lrs_at_time[t].push_back(&lr);
- }
- }
- min_required_size = 0;
- for (size_t t = 0; t <= max_end_time; ++t) {
- // Calculate minimum needed size at each timestamp
- uint32_t size_at_t = 0;
- for (auto &lr : lrs_at_time[t]) {
- size_at_t += lr->size;
- }
- size_at_time[t] = size_at_t;
- min_required_size = std::max(size_at_t, min_required_size);
- // Calculate all neighbours
- for (size_t i = 0; i < lrs_at_time[t].size(); ++i) {
- auto lr1 = lrs_at_time[t][i];
- auto &nb1 = neighbours[lr1->id];
- for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) {
- auto lr2 = lrs_at_time[t][j];
- if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) {
- nb1.push_back(lr2);
- neighbours[lr2->id].push_back(lr1);
- }
- }
- }
- }
- target_size = std::max(min_required_size, size_limit);
- // Calculate the urgency of each live range
- lr_urgency.resize(lrs.size());
- for (size_t i = 0; i < lrs.size(); ++i) {
- auto &lr = lrs[i];
- uint32_t urgency = 0;
- for (size_t t = lr.start_time; t <= lr.end_time; ++t) {
- urgency = std::max(size_at_time[t], urgency);
- }
- lr_urgency[i] = urgency;
- }
- best_size = UINT32_MAX;
-}
-
-uint32_t SearchAllocator::allocate(std::vector<uint32_t> &output) {
- output.clear();
- nr_iterations = 0;
- std::vector<size_t> indices;
- // Initial solution, using a heuristic allocator
- for (size_t i = 0; i < lrs.size(); ++i) {
- indices.push_back(i);
- }
- sort_indices_on_prio(indices);
- // Allocate the initial solution
- best_size = UINT32_MAX;
- best_size = allocate_indices(indices);
- if (best_size <= target_size) {
- // The heuristic allocation returned an optimal solution.
- // No need to search.
- } else {
- // Try to improve the heuristic allocation
- search(indices, best_size, MAX_ITERATIONS);
- }
- output.clear();
- for (auto &lr : lrs) {
- output.push_back(lr.address);
- }
- return best_size;
-}
-
-void SearchAllocator::allocate_lr(LiveRange &lr) const {
- uint32_t address = 0;
- int predecessor = NO_PREDECESSOR;
- bool fits = false;
- while (!fits) {
- fits = true;
- // Find neighbours that overlap with address
- for (auto lr2_p : neighbours[lr.id]) {
- if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) {
- continue;
- }
- if (lr2_p->overlaps(address, lr.size)) {
- // Overlap found; increase address
- fits = false;
- address = lr2_p->end_address;
- predecessor = lr2_p->id;
- }
- }
- }
- lr.address = address;
- lr.end_address = address + lr.size;
- lr.predecessor = predecessor;
-}
-
-uint32_t SearchAllocator::allocate_indices(const std::vector<size_t> &indices) {
- ++nr_iterations;
- std::vector<size_t> count(indices.size());
- for (auto &lr : lrs) {
- lr.address = NOT_ALLOCATED;
- }
- uint32_t size = 0;
- for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) {
- auto &lr = lrs[indices[turn]];
- allocate_lr(lr);
- lr.turn = turn;
- size = std::max(size, lr.end_address);
- }
- return size;
-}
-
-void SearchAllocator::sort_indices_on_prio(std::vector<size_t> &indices) const {
- std::sort(indices.begin(), indices.end(),
- [this] (size_t const& a, size_t const& b) {
- if (lr_urgency[a] != lr_urgency[b]) {
- return lr_urgency[a] > lr_urgency[b];
- }
- auto &lr1 = lrs[a];
- auto &lr2 = lrs[b];
- auto duration1 = lr1.end_time - lr1.start_time;
- auto duration2 = lr2.end_time - lr2.start_time;
- if (duration1 != duration2) {
- return duration1 > duration2;
- }
- if (lr1.start_time != lr2.start_time) {
- return lr1.start_time < lr2.start_time;
- }
- if (lr1.size != lr2.size) {
- return lr1.size > lr2.size;
- }
- return lr1.id < lr2.id;
- });
-}
-
-void SearchAllocator::add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const {
- turns.insert(lr.turn);
- int id = lr.id;
- while (lrs[id].predecessor != NO_PREDECESSOR) {
- id = lrs[id].predecessor;
- turns.insert(lrs[id].turn);
- }
-}
-
-void SearchAllocator::attempt_bottleneck_fix(std::vector<size_t> &indices) {
- // Find the bottleneck
- LiveRange *max_lr = &lrs[0];
- for (auto &lr : lrs) {
- if (lr.end_address > max_lr->end_address) {
- max_lr = &lr;
- }
- }
- // Find all live ranges that affected the placement of the bottleneck live range.
- // This consists of two types of live ranges:
- // - direct neighbours of the bottleneck live range
- // - direct and indirect predecessors of these neighbours + bottleneck
- // The turns at which these live ranges were allocated are put in the turns vector.
- std::set<size_t> turns;
- add_predecessor_turns(turns, *max_lr);
- for (auto lr_p : neighbours[max_lr->id]) {
- add_predecessor_turns(turns, *lr_p);
- }
- // Non-direct neighbours that interfere with the allocation of the bottleneck are the
- // immediate cause for gaps in the allocation, and are selected with higher probability.
- std::vector<size_t> turn_list;
- std::vector<size_t> non_nb_turn_list;
- for (auto turn : turns) {
- turn_list.push_back(turn);
- auto &lr = lrs[indices[turn]];
- if (!max_lr->is_neighbour(lr)) {
- non_nb_turn_list.push_back(turn);
- }
- }
- size_t ix1;
- if (rng() % 100 < 30 && !non_nb_turn_list.empty()) {
- // Pick a live range from the "non-neighbour list"
- ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()];
- } else {
- // Pick any affecting live range.
- ix1 = turn_list[rng() % turn_list.size()];
- }
- // Note: turn_list has always at least 2 elements for bottlenecks
- size_t ix2 = turn_list[rng() % (turn_list.size() - 1)];
- if (ix1 == ix2) {
- ix2 = turn_list[turn_list.size() - 1];
- }
- // Swap indices
- std::swap(indices[ix1], indices[ix2]);
-}
-
-void SearchAllocator::search(std::vector<size_t> &indices, uint32_t initial_size, int iterations) {
- std::vector<size_t> best_indices = indices;
- std::vector<LiveRange> best_lrs = lrs;
- for (int i = 0; i < iterations; ++i) {
- // Reorder the indices
- attempt_bottleneck_fix(indices);
- // Allocate the reordered indices and check if it gave an improvement
- auto new_size = allocate_indices(indices);
- if (new_size <= best_size) {
- // The new allocation produced a new best result; remember it
- best_size = new_size;
- best_indices = indices;
- best_lrs = lrs;
- if (best_size <= target_size) {
- // Target reached; stop
- return;
- }
- } else {
- // The new allocation produced worse result; undo the change
- indices = best_indices;
- lrs = best_lrs;
- }
- }
- lrs = best_lrs;
-}
-
-uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output) {
- // Convert input to vector of live ranges
- std::vector<LiveRange> lrs;
- for (size_t i = 0; i < input.size(); i += 3) {
- LiveRange lr;
- lr.start_time = input[i];
- lr.end_time = input[i+1];
- lr.size = input[i+2];
- lrs.push_back(lr);
- }
- SearchAllocator allocator(lrs, available_size);
- return allocator.allocate(output);
-}
diff --git a/ethosu/tensor_allocator/search_allocator.h b/ethosu/tensor_allocator/search_allocator.h
deleted file mode 100644
index 6c75015..0000000
--- a/ethosu/tensor_allocator/search_allocator.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Description:
- * Declaration of the search-based allocator.
- */
-
-#ifndef __SEARCH_ALLOCATOR_H
-#define __SEARCH_ALLOCATOR_H
-
-#include <algorithm>
-#include <cstdint>
-#include <random>
-#include <set>
-#include <vector>
-
-/**
- * Live range
- */
-struct LiveRange {
- /** Start time (input to the allocator algorithm) */
- uint32_t start_time;
- /** End time, inclusive (input to the allocator algorithm) */
- uint32_t end_time;
- /** Size in bytes (input to the allocator algorithm) */
- uint32_t size;
- /** Index of this live range */
- int id;
- /** Allocated address (the main output from the allocator algorithm) */
- uint32_t address;
- /** End address, exclusive */
- uint32_t end_address;
- /** id of predecessor live range (predecessor's end address == this lr's address) */
- int predecessor;
- /** Turn at which the live range was allocated */
- size_t turn;
-
- bool overlaps(uint32_t addr2, uint32_t size2) const {
- return address < addr2 + size2 && addr2 < end_address;
- }
- bool is_neighbour(const LiveRange &lr) const {
- return start_time <= lr.end_time && lr.start_time <= end_time;
- }
-};
-
-/**
- * Implements tensor allocator using state space exploration.
- *
- * The basic algorithm is:
- *
- * Use a heuristic allocator to find an initial allocation
- * while allocation is not optimal and iterations < MAX_ITERATIONS {
- * find the "bottleneck": the live range with highest end address
- * find all live ranges that affected the allocation of the bottleneck
- * swap the order of any two affecting live ranges
- * reallocate tensors using the reordered live ranges
- * if the new allocation is better: keep it, else set allocation to previous allocation
- * }
- */
-class SearchAllocator {
-private:
- static constexpr int MAX_ITERATIONS = 500;
- static constexpr uint32_t NOT_ALLOCATED = UINT32_MAX;
- /** Used for live ranges allocated at address 0 */
- static constexpr int NO_PREDECESSOR = -1;
- /** Contains the live ranges */
- std::vector<LiveRange> lrs;
- /** Contains active live ranges at each timestamp */
- std::vector<std::vector<LiveRange*>> lrs_at_time;
- /**
- * Contains neighbours of each live range (indexed by lr.id), i.e.
- * live ranges with overlapping start/end time.
- */
- std::vector<std::vector<LiveRange*>> neighbours;
- /**
- * At each timestamp: accumulated size of active live ranges
- */
- std::vector<uint32_t> size_at_time;
- /**
- * For each live range: max value of size_at_time (only used in the heuristic allocation)
- */
- std::vector<uint32_t> lr_urgency;
- /**
- * The minimum possible size, assuming all live ranges can be perfectly allocated
- */
- uint32_t min_required_size;
- /** The algorithm stops once the target size has been achieved */
- uint32_t target_size;
- /** The highest end address of the best found allocation */
- uint32_t best_size;
- /** Number of performed iterations */
- size_t nr_iterations = 0;
- /** Random number generator; use default seed (which is well-defined) */
- std::mt19937 rng;
-public:
- SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit);
- /**
- * Runs the allocation algorithm. Finishes when the target size has been
- * reached or when maximum iterations have been run.
- * The allocated addresses are placed in the output vector, in the same
- * order as the input vector.
- *
- * Implementation note: the algorithm produces reproduceable results by using
- * a well-defined random number generator with well-defined default seed,
- * and using a fixed number of iterations.
- */
- uint32_t allocate(std::vector<uint32_t> &output);
- uint32_t get_min_required_size() const {
- return min_required_size;
- }
- size_t get_nr_iterations() const {
- return nr_iterations;
- }
-private:
- /**
- * Allocates the given live range at the smallest possible address
- */
- void allocate_lr(LiveRange &lr) const;
- /**
- * Allocates the live ranges in the order indicated by the indices;
- * allocates each live range at the lowest possible address.
- */
- uint32_t allocate_indices(const std::vector<size_t> &indices);
- /** Sorts live ranges based on heuristics, used for the initial allocation */
- void sort_indices_on_prio(std::vector<size_t> &indices) const;
- /** Adds the given live range + predecessors to the turns vector */
- void add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const;
- /**
- * Finds the "bottleneck", the live range with highest end address, and reorders the indices
- * such that a next allocation might lower the memory usage.
- *
- * ---------
- * | |
- * | D |
- * | |
- * ----------------------------------
- * | B |
- * -------------------------------
- * | |
- * |A| ---
- * | | |C|
- * | | | |
- * ---------------------------------------
- *
- * In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
- * The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
- * and all direct and indirect predecessors of D and its neighbours
- * (i.e. A, which is the predecessor of B, and indirect predecessor of D).
- *
- * By permuting the order in which the affecting live ranges are allocated, the bottleneck might
- * be lowered. In the above example, almost any permutation would lower the bottleneck.
- *
- * Note that there is room to improve the efficiency of the algorithm.
- * One way could be to first allocate all direct neighbours of the bottleneck
- * (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does
- * not actively try this, as it may lead to allocation loops (A could become the new bottle-neck);
- * it just uses a higher probability of selecting A.
- */
- void attempt_bottleneck_fix(std::vector<size_t> &indices);
- /** Search for a solution, using the given indices as initial solution. */
- void search(std::vector<size_t> &indices, uint32_t initial_size, int iterations);
-};
-
-/** Wrapper function to perform live range allocation */
-uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output);
-
-#endif // __SEARCH_ALLOCATOR_H
diff --git a/ethosu/tensor_allocator/tensor_allocator_main.cpp b/ethosu/tensor_allocator/tensor_allocator_main.cpp
deleted file mode 100644
index 27d96ef..0000000
--- a/ethosu/tensor_allocator/tensor_allocator_main.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#include "search_allocator.h"
-
-using namespace std;
-
-/**
- * Reads live ranges from the input, and then performs allocation.
- * The input has format:
-
-<number of live ranges>
-<start_time> <end_time> <size>
-...
-
- * e.g.:
-4
-0 20 4096
-2 8 16000
-4 10 800
-6 20 1024
- */
-int main() {
- int lr_count;
- cin >> lr_count;
- cin.ignore();
- vector<uint32_t> input;
- vector<uint32_t> output;
- for (int i = 0; i < lr_count; ++i) {
- LiveRange lr;
- cin >> lr.start_time >> lr.end_time >> lr.size;
- lr.id = i;
- cin.ignore();
- input.push_back(lr.start_time);
- input.push_back(lr.end_time);
- input.push_back(lr.size);
- }
- vector<LiveRange> lrs;
- for (size_t i = 0; i < input.size(); i += 3) {
- LiveRange lr;
- lr.start_time = input[i];
- lr.end_time = input[i+1];
- lr.size = input[i+2];
- lrs.push_back(lr);
- }
- SearchAllocator allocator(lrs, 0);
- uint32_t result = allocator.allocate(output);
- printf("Output:\n");
- for (int i = 0; i < lr_count; ++i) {
- printf("%4d: %6d %4d-%4d size %6d\n", i, output[i], input[3*i], input[3*i+1], input[3*i+2]);
- }
- uint32_t min_size = allocator.get_min_required_size();
- double overhead = 100.0 * (result - min_size)/(double)min_size;
- printf("Total size: %d (%1.1f K), minimum required size: %d, overhead: %1.2f%%\n",
- result, result/1024.0, min_size, overhead);
- printf("Search used %ld iterations\n", (long)allocator.get_nr_iterations());
- return 0;
-}
diff --git a/ethosu/tensor_allocator/tensor_allocatormodule.cpp b/ethosu/tensor_allocator/tensor_allocatormodule.cpp
deleted file mode 100644
index 52f1c69..0000000
--- a/ethosu/tensor_allocator/tensor_allocatormodule.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-#include <cstdint>
-
-#include <vector>
-#include "search_allocator.h"
-
-
-
-/**
- * C++ extension wrapper for allocate
- *
- * This method is exposed directly in python with the arguments with a
- * prototype of the form:
- *
- * output = tensor_allocator.allocate(input, available_size=0)
- *
- * input: [int]
- * available_size: int
- * output: [int]
- */
-static PyObject *method_allocate (PyObject *self, PyObject *args)
-{
- /* Object to hold the input integer list. */
- PyObject *input_list_object;
-
- /* Object to hold the available size */
- int available_size = 0;
-
- /* Arguments to the method are delivered as a tuple, unpack the
- * tuple to get the individual arguments, note the second is
- * optional.
- */
- if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &available_size)) {
- return NULL;
- }
-
- /* Unpack the length of the input integer list. */
- auto input_length = PyObject_Length(input_list_object);
- if (input_length < 0) {
- return NULL;
- }
- if (input_length % 3 != 0) {
- PyErr_SetString(PyExc_ValueError, "Input length must be multiple of 3");
- return NULL;
- }
- std::vector<uint32_t> input;
- std::vector<uint32_t> output;
- for (int i = 0; i < input_length; ++i) {
- PyObject *obj = PyList_GetItem(input_list_object, i);
- if (!PyLong_Check(obj)) {
- PyErr_SetString(PyExc_ValueError, "Illegal value in input");
- return NULL;
- }
- auto value = PyLong_AsLong(obj);
- if (value < 0 || value > UINT32_MAX) {
- PyErr_SetString(PyExc_ValueError, "Input value out of bounds");
- return NULL;
- }
- input.push_back(value);
- }
- allocate(input, available_size, output);
- PyObject *output_list = PyList_New(output.size());
- for (size_t i = 0; i < output.size(); ++i) {
- PyList_SetItem(output_list, i, PyLong_FromLong(output[i]));
- }
- return output_list;
-}
-
-/** tensor_allocator method descriptors. */
-static PyMethodDef tensor_allocator_methods[] = {
- {"allocate", method_allocate, METH_VARARGS, "Python interface for allocate"},
- {NULL, NULL, 0, NULL}
-};
-
-/** tensor_allocator module descriptor. */
-static struct PyModuleDef tensor_allocatormodule = {
- PyModuleDef_HEAD_INIT,
- "tensor_allocator",
- "Python interface for tensor_allocator",
- -1,
- tensor_allocator_methods
-};
-
-PyMODINIT_FUNC PyInit_tensor_allocator(void) {
- return PyModule_Create(&tensor_allocatormodule);
-}
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
new file mode 100644
index 0000000..de53ab8
--- /dev/null
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Tensor allocator based on a hill-climb search
+import random
+from typing import List
+from typing import Set
+
+from .live_range import LiveRange
+
+
+class LiveRangeInfo:
+ def __init__(self, id: int, start_time: int, end_time: int, size: int):
+ # Index of this live range
+ self.id = id
+ # Start time (input to the allocator algorithm)
+ self.start_time = start_time
+ # End time, inclusive (input to the allocator algorithm)
+ self.end_time = end_time
+ # Size in bytes (input to the allocator algorithm)
+ self.size = size
+ # Allocated address (the main output from the allocator algorithm)
+ self.address: int = 0
+ # End address, exclusive
+ self.end_address: int = 0
+ # id of predecessor live range (predecessor's end address == this lr's address)
+ self.predecessor: int = 0
+ # Turn at which the live range was allocated
+ self.turn: int = 0
+ # Max value of size_at_time (only used in the heuristic allocation)
+ self.urgency = 0
+ self.neighbours: List["LiveRangeInfo"] = []
+
+ def overlaps(self, addr2: int, size2: int) -> int:
+ return self.address < addr2 + size2 and addr2 < self.end_address
+
+ def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
+ return self.start_time <= lr.end_time and lr.start_time <= self.end_time
+
+ def __str__(self):
+ return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
+ self.id, self.start_time, self.end_time, self.size, self.address
+ )
+
+ def __lt__(self, other) -> bool:
+ if self.urgency != other.urgency:
+ return self.urgency > other.urgency
+ duration1 = self.end_time - self.start_time
+ duration2 = other.end_time - other.start_time
+ if duration1 != duration2:
+ return duration1 > duration2
+
+ if self.start_time != other.start_time:
+ return self.start_time < other.start_time
+
+ if self.size != other.size:
+ return self.size > other.size
+
+ return self.id < other.id
+
+
+class HillClimbAllocator:
+ """
+ Implements tensor allocator using a hill climb search.
+
+ The basic algorithm is:
+
+ Use a heuristic allocator to find an initial allocation
+ while allocation is not optimal and iterations < MAX_ITERATIONS:
+ find the "bottleneck": the live range with highest end address
+ find all live ranges that affected the allocation of the bottleneck
+ swap the order of any two affecting live ranges
+ reallocate tensors using the reordered live ranges
+ if the new allocation is better: keep it, else set allocation to previous allocation
+ """
+
+ MAX_ITERATIONS = 500
+ NOT_ALLOCATED = -1
+ # Used for live ranges allocated at address 0
+ NO_PREDECESSOR = -1
+
+ def __init__(self, live_ranges: List[LiveRange]):
+ # Contains the live ranges
+ self.lrs: List[LiveRangeInfo] = [
+ LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges)
+ ]
+ self.lrs_at_time = []
+ # The available size (input to algorithm).
+ self.available_size: int = 0
+ # The algorithm stops once the target size has been achieved
+ self.target_size: int = 0
+ # The highest end address of the best found allocation
+ self.best_size: int = 1 << 63
+ # For each live range: max value of size_at_time (only used in the heuristic allocation)
+ self.lr_urgency = len(self.lrs) * [0]
+ nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
+ # Contains active live ranges at each timestamp
+ self.lrs_at_time = [[] for i in range(nr_time_slots)]
+ for lr in self.lrs:
+ for t in range(lr.start_time, lr.end_time + 1):
+ self.lrs_at_time[t].append(lr)
+ # At each timestamp: accumulated size of active live ranges
+ size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
+ # The minimum possible size, assuming all live ranges can be perfectly allocated
+ self.min_required_size: int = max(size_at_time)
+ # Calculate all neighbours + the urgency of each live range
+ for lr in self.lrs:
+ lr.urgency = 0
+ lr.neighbours = []
+ neighbours = set()
+ for t in range(lr.start_time, lr.end_time + 1):
+ lr.urgency = max(size_at_time[t], lr.urgency)
+ for lr2 in self.lrs_at_time[t]:
+ if lr2 not in neighbours and lr != lr2:
+ neighbours.add(lr2)
+ lr.neighbours.append(lr2)
+
+ def allocate_lr(self, lr: LiveRangeInfo):
+ """
+ Allocates the given live range at the smallest possible address
+ """
+ address = 0
+ predecessor = HillClimbAllocator.NO_PREDECESSOR
+ fits = False
+ while not fits:
+ fits = True
+ # Find neighbours that overlap with address
+ for lr2 in lr.neighbours:
+ if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
+ continue
+ if lr2.overlaps(address, lr.size):
+ # Overlap found increase address
+ fits = False
+ address = lr2.end_address
+ predecessor = lr2.id
+ lr.address = address
+ lr.end_address = address + lr.size
+ lr.predecessor = predecessor
+
+ def allocate_indices(self, indices: List[int]):
+ """
+ Allocates the live ranges in the order indicated by the indices;
+ allocates each live range at the lowest possible address.
+ """
+ for lr in self.lrs:
+ lr.address = HillClimbAllocator.NOT_ALLOCATED
+ size = 0
+ for turn, index in enumerate(indices):
+ lr = self.lrs[index]
+ self.allocate_lr(lr)
+ lr.turn = turn
+ size = max(size, lr.end_address)
+ if size > self.best_size:
+ # This allocation is worse than the best known allocation;
+ # no need to continue
+ break
+ return size
+
+ def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
+ """
+ Adds the given live range + predecessors to the turns vector.
+ Note: the turn_set is for quick detection of duplicates,
+ the turn_list is to get reproduceable results
+ """
+ if lr.turn not in turn_set:
+ turn_set.add(lr.turn)
+ turn_list.append(lr.turn)
+ id = lr.id
+ while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
+ id = self.lrs[id].predecessor
+ turn = self.lrs[id].turn
+ if turn not in turn_set:
+ turn_set.add(turn)
+ turn_list.append(turn)
+
+ def attempt_bottleneck_fix(self, indices: List[int]):
+ """
+ Finds the "bottleneck", the live range with highest end address, and reorders the indices
+ such that a next allocation might lower the memory usage.
+
+ ---------
+ | |
+ | D |
+ | |
+ ----------------------------------
+ | B |
+ -------------------------------
+ | |
+ |A| ---
+ | | |C|
+ | | | |
+ ---------------------------------------
+
+ In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
+ The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
+ and all direct and indirect predecessors of D and its neighbours
+ (i.e. A, which is the predecessor of B, and indirect predecessor of D).
+
+ By permuting the order in which the affecting live ranges are allocated, the bottleneck might
+ be lowered. In the above example, almost any permutation would lower the bottleneck.
+ """
+ # Find the bottleneck
+ max_lr = self.lrs[0]
+ for lr in self.lrs[1:]:
+ if lr.end_address > max_lr.end_address:
+ max_lr = lr
+
+ # Find all live ranges that affected the placement of the bottleneck live range.
+ # This consists of two types of live ranges:
+ # - direct neighbours of the bottleneck live range
+ # - direct and indirect predecessors of these neighbours + bottleneck
+ # The turns at which these live ranges were allocated are put in the turns set.
+ turn_set = set()
+ turn_list = list()
+ self.add_predecessor_turns(turn_set, turn_list, max_lr)
+ for lr2 in max_lr.neighbours:
+ self.add_predecessor_turns(turn_set, turn_list, lr2)
+
+ # Non-direct neighbours that interfere with the allocation of the bottleneck are the
+ # immediate cause for gaps in the allocation, and are selected with higher probability.
+ non_nb_turn_list = []
+ for turn in turn_list:
+ lr = self.lrs[indices[turn]]
+ if not max_lr.is_neighbour(lr):
+ non_nb_turn_list.append(turn)
+ assert turn_list
+ # Pick from non-neighbour list with 30% probability
+ # (magic number based on tuning)
+ if random.randint(0, 100) < 30 and non_nb_turn_list:
+ # Pick a live range from the "non-neighbour list"
+ ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
+ else:
+ # Pick any affecting live range.
+ ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
+
+ ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
+ if ix1 == ix2:
+ ix2 = turn_list[-1]
+ # Swap indices
+ indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
+
+ def search(self, indices: List[int], iterations: int):
+ """
+ Search for a solution, using the given indices as initial solution.
+ """
+ best_indices = indices[:]
+ for _ in range(iterations):
+ # Reorder the indices
+ self.attempt_bottleneck_fix(indices)
+ # Allocate the reordered indices and check if it gave an improvement
+ new_size = self.allocate_indices(indices)
+ if new_size <= self.best_size:
+ # The new allocation produced a new best result remember it
+ self.best_size = new_size
+ best_indices = indices[:]
+ self.allocated_addresses = [lr.address for lr in self.lrs]
+ if self.best_size <= self.min_required_size:
+ # Target reached stop
+ return
+ else:
+ # The new allocation produced worse result undo the change
+ indices = best_indices[:]
+
+ def allocate(self) -> List[int]:
+ """
+ Runs the allocation algorithm. Finishes when an optimal solution has been
+ found or when maximum iterations have been run.
+ The allocated addresses are placed in the output vector, in the same
+ order as the input vector.
+
+ Implementation note: the algorithm produces reproduceable results by using
+ a well-defined random number generator with well-defined default seed,
+ and using a fixed number of iterations.
+ """
+ random.seed(1)
+ # Sort indices on priority. Note: self.lrs must be left unchanged
+ indices = [lr.id for lr in sorted(self.lrs)]
+ # Allocate the initial solution
+ self.best_size = self.allocate_indices(indices)
+ self.allocated_addresses = [lr.address for lr in self.lrs]
+ if self.best_size > self.min_required_size:
+ # Try to improve the heuristic allocation
+ self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
+ # else the heuristic allocation returned an optimal solution; no search needed
+ return self.allocated_addresses
+
+
+def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
+ """
+ Allocates live ranges using a search based allocator.
+ Returns the list of allocated addresses (one for each live range)
+ """
+ if not lrs:
+ return []
+ allocator = HillClimbAllocator(lrs)
+ return allocator.allocate()
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index db878bc..c45d0e3 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -38,7 +38,7 @@ class PassPlacement(enum.Enum):
class TensorAllocator(enum.Enum):
LinearAlloc = 1
Greedy = 2
- Search = 3
+ HillClimb = 3
def __str__(self):
return self.name
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 0a7da5d..b7057f0 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -20,6 +20,7 @@ import math
import numpy as np
+from . import hillclimb_allocation
from . import live_range
from . import numeric_util
from .errors import AllocationError
@@ -30,7 +31,6 @@ from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorPurpose
-from ethosu import tensor_allocator
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
@@ -63,22 +63,14 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation
return total_sz
-def search_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
- # Allocates using the search-based allocator (implemented in C++)
- input = []
- lrs = []
- lr_set = set()
- for lr in live_ranges.ranges.values():
- lr_set.add((lr.start_time, lr.end_time, lr))
- lr_list = sorted(lr_set)
- # Create a single array of ints containing start/end/size of the live ranges
- for start, end, lr in lr_list:
- input += [start, end, numeric_util.round_up(lr.size, alloc_granularity)]
- lrs.append(lr)
- addresses = tensor_allocator.allocate(input, 0)
+def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
+ # Allocates using the hill climb allocator
+ lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()}
+ lr_list = [lr for _, _, lr in lr_set]
+ addresses = hillclimb_allocation.allocate_live_ranges(lr_list)
# The result is a list containing the allocated addresses
total_sz = 0
- for lr, address in zip(lrs, addresses):
+ for lr, address in zip(lr_list, addresses):
total_sz = max(total_sz, address + lr.size)
lr.set_address(address)
verify_allocation(live_ranges, alloc_granularity)
@@ -179,8 +171,8 @@ def allocate_tensors(
verify_allocation(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.LinearAlloc:
total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
- elif tens_alloc == TensorAllocator.Search:
- total_sz = search_allocate_live_ranges(lrs, cpu_tensor_alignment)
+ elif tens_alloc == TensorAllocator.HillClimb:
+ total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
else:
assert 0
alloc_ok = max_size is None or total_sz <= max_size
diff --git a/ethosu/tensor_allocator/test/test_tensor_allocator.py b/ethosu/vela/test/test_hillclimb_allocation.py
index 1011279..8a56c3f 100644
--- a/ethosu/tensor_allocator/test/test_tensor_allocator.py
+++ b/ethosu/vela/test/test_hillclimb_allocation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -15,10 +15,12 @@
# limitations under the License.
#
# Description:
-# Unit tests for tensor_allocator.
+# Unit tests for hillclimb_allocator.
import pytest
-from ethosu import tensor_allocator
+from ethosu.vela.hillclimb_allocation import allocate_live_ranges
+from ethosu.vela.live_range import LiveRange
+
test_data = [
([(0, 100, 8000), (0, 1, 8016), (100, 110, 2000), (108, 110, 4000), (109, 110, 6000)], 16016),
@@ -40,24 +42,22 @@ test_data = [
]
+def live_range(start_time, end_time, size):
+ lr = LiveRange(None, 1)
+ lr.start_time = start_time
+ lr.end_time = end_time
+ lr.size = size
+ return lr
+
+
@pytest.mark.parametrize("lrs, expected_size", test_data)
def test_allocate(lrs, expected_size):
"""Tests the search allocator"""
- input = [x for lr in lrs for x in lr]
- res = tensor_allocator.allocate(input, 0)
+ lr_list = [live_range(start, end, size) for start, end, size in lrs]
+ res = allocate_live_ranges(lr_list)
assert len(res) == len(lrs)
assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == expected_size
def test_allocate_empty_input():
- assert [] == tensor_allocator.allocate([], 0)
-
-
-invalid_input_test_data = [None, 3, [1, 2, 16, 9, 15], [1, 5, None], [-1, 0, 16], [1, 2, 10000000000]]
-
-
-@pytest.mark.parametrize("input", invalid_input_test_data)
-def test_allocate_invalid_input(input):
- """Tests the search allocator with invalid input data"""
- with pytest.raises(Exception):
- tensor_allocator.allocate(input, 0)
+ assert [] == allocate_live_ranges([])
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index c4510b1..7271002 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -317,7 +317,7 @@ def main(args=None):
)
parser.add_argument(
"--tensor-allocator",
- default=TensorAllocator.Search,
+ default=TensorAllocator.HillClimb,
type=lambda s: TensorAllocator[s],
choices=list(TensorAllocator),
help="Tensor Allocator algorithm (default: %(default)s)",
diff --git a/setup.py b/setup.py
index f67d42e..98dfe52 100644
--- a/setup.py
+++ b/setup.py
@@ -44,12 +44,6 @@ mlw_module = Extension(
["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"],
)
-tensor_allocator_module = Extension(
- "ethosu.tensor_allocator",
- ["ethosu/tensor_allocator/tensor_allocatormodule.cpp", "ethosu/tensor_allocator/search_allocator.cpp"],
- depends=["ethosu/tensor_allocator/search_allocator.h"],
-)
-
setup(
name="ethos-u-vela",
use_scm_version=True,
@@ -66,7 +60,6 @@ setup(
"Operating System :: POSIX :: Linux",
"Operating System :: Microsoft :: Windows",
"Programming Language :: C",
- "Programming Language :: C++ :: C++11",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
@@ -84,6 +77,6 @@ setup(
"lxml>=4.5.1",
],
entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]},
- ext_modules=[mlw_module, tensor_allocator_module],
+ ext_modules=[mlw_module],
setup_requires=["setuptools_scm"],
)